clang 19.0.0git
Macros | Functions
avx2intrin.h File Reference

Go to the source code of this file.

Macros

#define __DEFAULT_FN_ATTRS256
 
#define __DEFAULT_FN_ATTRS128
 
#define _mm256_mpsadbw_epu8(X, Y, M)
 Computes sixteen sum of absolute difference (SAD) operations on sets of four unsigned 8-bit integers from the 256-bit integer vectors X and Y.
 
#define _mm256_alignr_epi8(a, b, n)
 Uses the lower half of the 256-bit vector a as the upper half of a temporary 256-bit value, and the lower half of the 256-bit vector b as the lower half of the temporary value.
 
#define _mm256_blend_epi16(V1, V2, M)
 Merges 16-bit integer values from either of the two 256-bit vectors V1 or V2, as specified by the immediate integer operand M, and returns the resulting 256-bit vector of [16 x i16].
 
#define _mm256_shuffle_epi32(a, imm)    ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
 Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in a according to control information in the integer literal imm, and returns the 256-bit result.
 
#define _mm256_shufflehi_epi16(a, imm)    ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
 Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit result.
 
#define _mm256_shufflelo_epi16(a, imm)    ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
 Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit [16 x i16] result.
 
#define _mm256_slli_si256(a, imm)    ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
 Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result.
 
#define _mm256_bslli_epi128(a, imm)    ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
 Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result.
 
#define _mm256_srli_si256(a, imm)    ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result.
 
#define _mm256_bsrli_epi128(a, imm)    ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result.
 
#define _mm_broadcastsi128_si256(X)   _mm256_broadcastsi128_si256(X)
 
#define _mm_blend_epi32(V1, V2, M)
 Merges 32-bit integer elements from either of the two 128-bit vectors of [4 x i32] in V1 or V2 to the result's 128-bit vector of [4 x i32], as specified by the immediate integer operand M.
 
#define _mm256_blend_epi32(V1, V2, M)
 Merges 32-bit integer elements from either of the two 256-bit vectors of [8 x i32] in V1 or V2 to return a 256-bit vector of [8 x i32], as specified by the immediate integer operand M.
 
#define _mm256_permute4x64_pd(V, M)    ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
 Sets the result's 256-bit vector of [4 x double] to copies of elements of the 256-bit vector of [4 x double] in V as specified by the immediate value M.
 
#define _mm256_permute4x64_epi64(V, M)    ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
 Sets the result's 256-bit vector of [4 x i64] to copies of elements of the 256-bit vector of [4 x i64] in V as specified by the immediate value M.
 
#define _mm256_permute2x128_si256(V1, V2, M)    ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
 Sets each half of the 256-bit result either to zero or to one of the four possible 128-bit halves of the 256-bit vectors V1 and V2, as specified by the immediate value M.
 
#define _mm256_extracti128_si256(V, M)    ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
 Extracts half of the 256-bit vector V to the 128-bit result.
 
#define _mm256_inserti128_si256(V1, V2, M)
 Copies the 256-bit vector V1 to the result, then overwrites half of the result with the 128-bit vector V2.
 
#define _mm_mask_i32gather_pd(a, m, i, mask, s)
 Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_mask_i32gather_pd(a, m, i, mask, s)
 Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm_mask_i64gather_pd(a, m, i, mask, s)
 Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_mask_i64gather_pd(a, m, i, mask, s)
 Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_mask_i32gather_ps(a, m, i, mask, s)
 Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_mask_i32gather_ps(a, m, i, mask, s)
 Conditionally gathers eight 32-bit floating-point values, either from the 256-bit vector of [8 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
 
#define _mm_mask_i64gather_ps(a, m, i, mask, s)
 Conditionally gathers two 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_mask_i64gather_ps(a, m, i, mask, s)
 Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_mask_i32gather_epi32(a, m, i, mask, s)
 Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)
 Conditionally gathers eight 32-bit integer values, either from the 256-bit vector of [8 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
 
#define _mm_mask_i64gather_epi32(a, m, i, mask, s)
 Conditionally gathers two 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)
 Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_mask_i32gather_epi64(a, m, i, mask, s)
 Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)
 Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm_mask_i64gather_epi64(a, m, i, mask, s)
 Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s)
 Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_i32gather_pd(m, i, s)
 Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_i32gather_pd(m, i, s)
 Gathers four 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm_i64gather_pd(m, i, s)
 Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_i64gather_pd(m, i, s)
 Gathers four 64-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_i32gather_ps(m, i, s)
 Gathers four 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_i32gather_ps(m, i, s)
 Gathers eight 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
 
#define _mm_i64gather_ps(m, i, s)
 Gathers two 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_i64gather_ps(m, i, s)
 Gathers four 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_i32gather_epi32(m, i, s)
 Gathers four 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_i32gather_epi32(m, i, s)
 Gathers eight 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
 
#define _mm_i64gather_epi32(m, i, s)
 Gathers two 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_i64gather_epi32(m, i, s)
 Gathers four 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 
#define _mm_i32gather_epi64(m, i, s)
 Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm256_i32gather_epi64(m, i, s)
 Gathers four 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
 
#define _mm_i64gather_epi64(m, i, s)
 Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
 
#define _mm256_i64gather_epi64(m, i, s)
 Gathers four 64-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
 

Functions

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8 (__m256i __a)
 Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each value in the corresponding byte of the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16 (__m256i __a)
 Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a and returns each value in the corresponding element of the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32 (__m256i __a)
 Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a and returns each value in the corresponding element of the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16 (__m256i __a, __m256i __b)
 Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation, and returns the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32 (__m256i __a, __m256i __b)
 Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation, and returns the resulting 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16 (__m256i __a, __m256i __b)
 Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation, and returns the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32 (__m256i __V1, __m256i __V2)
 Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation, and returns the resulting 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8 (__m256i __a, __m256i __b)
 Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 bits of each sum in the corresponding byte of the 256-bit integer vector result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16 (__m256i __a, __m256i __b)
 Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in the corresponding element of the [16 x i16] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32 (__m256i __a, __m256i __b)
 Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in the corresponding element of the [8 x i32] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64 (__m256i __a, __m256i __b)
 Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the lower 64 bits of each sum in the corresponding element of the [4 x i64] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi8 (__m256i __a, __m256i __b)
 Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi16 (__m256i __a, __m256i __b)
 Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu8 (__m256i __a, __m256i __b)
 Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu16 (__m256i __a, __m256i __b)
 Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256 (__m256i __a, __m256i __b)
 Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_andnot_si256 (__m256i __a, __m256i __b)
 Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit integer vector in __a.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8 (__m256i __a, __m256i __b)
 Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16 (__m256i __a, __m256i __b)
 Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_blendv_epi8 (__m256i __V1, __m256i __V2, __m256i __M)
 Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the 256-bit mask __M and returns the resulting 256-bit integer vector.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi8 (__m256i __a, __m256i __b)
 Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns the outcomes in the corresponding bytes of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi16 (__m256i __a, __m256i __b)
 Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi32 (__m256i __a, __m256i __b)
 Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi64 (__m256i __a, __m256i __b)
 Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi8 (__m256i __a, __m256i __b)
 Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than and returns the outcomes in the corresponding bytes of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi16 (__m256i __a, __m256i __b)
 Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi32 (__m256i __a, __m256i __b)
 Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi64 (__m256i __a, __m256i __b)
 Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16 (__m256i __a, __m256i __b)
 Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in an element of the [16 x i16] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32 (__m256i __a, __m256i __b)
 Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in an element of the [8 x i32] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16 (__m256i __a, __m256i __b)
 Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each sum in an element of the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16 (__m256i __a, __m256i __b)
 Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each difference in an element of the [16 x i16] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32 (__m256i __a, __m256i __b)
 Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each difference in an element of the [8 x i32] result (overflow is ignored).
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16 (__m256i __a, __m256i __b)
 Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each difference in an element of the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16 (__m256i __a, __m256i __b)
 Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed byte from the 256-bit integer vector in __b, forming signed 16-bit intermediate products.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16 (__m256i __a, __m256i __b)
 Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit intermediate products, and adds pairs of those products to form 32-bit sums returned as elements of the [8 x i32] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8 (__m256i __a, __m256i __b)
 Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16 (__m256i __a, __m256i __b)
 Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32 (__m256i __a, __m256i __b)
 Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8 (__m256i __a, __m256i __b)
 Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16 (__m256i __a, __m256i __b)
 Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32 (__m256i __a, __m256i __b)
 Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8 (__m256i __a, __m256i __b)
 Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16 (__m256i __a, __m256i __b)
 Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32 (__m256i __a, __m256i __b)
 Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8 (__m256i __a, __m256i __b)
 Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16 (__m256i __a, __m256i __b)
 Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32 (__m256i __a, __m256i __b)
 Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
 
static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8 (__m256i __a)
 Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vector in __a and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi16 (__m128i __V)
 Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi32 (__m128i __V)
 Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi64 (__m128i __V)
 Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32 (__m128i __V)
 Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi64 (__m128i __V)
 Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi64 (__m128i __V)
 Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi16 (__m128i __V)
 Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi32 (__m128i __V)
 Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi64 (__m128i __V)
 Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi32 (__m128i __V)
 Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi64 (__m128i __V)
 Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_epi64 (__m128i __V)
 Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epi32 (__m256i __a, __m256i __b)
 Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16 (__m256i __a, __m256i __b)
 Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit results to the most significant 18 bits, rounds by adding 1, and returns bits [16:1] of each rounded product in the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epu16 (__m256i __a, __m256i __b)
 Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epi16 (__m256i __a, __m256i __b)
 Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi16 (__m256i __a, __m256i __b)
 Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower 16 bits of each 32-bit product in the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi32 (__m256i __a, __m256i __b)
 Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower 32 bits of each 64-bit product in the [8 x i32] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epu32 (__m256i __a, __m256i __b)
 Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_or_si256 (__m256i __a, __m256i __b)
 Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8 (__m256i __a, __m256i __b)
 Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8 (__m256i __a, __m256i __b)
 Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256-bit integer vector __b, and returns the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8 (__m256i __a, __m256i __b)
 Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a, the negative of that byte, or zero, depending on whether the corresponding byte of the 256-bit integer vector in __b is greater than zero, less than zero, or equal to zero, respectively.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16 (__m256i __a, __m256i __b)
 Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [16 x i16] in __b is greater than zero, less than zero, or equal to zero, respectively.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32 (__m256i __a, __m256i __b)
 Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [8 x i32] in __b is greater than zero, less than zero, or equal to zero, respectively.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16 (__m256i __a, int __count)
 Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16 (__m256i __a, __m128i __count)
 Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits specified by the lower 64 bits of __count, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32 (__m256i __a, int __count)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32 (__m256i __a, __m128i __count)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64 (__m256i __a, int __count)
 Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64 (__m256i __a, __m128i __count)
 Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16 (__m256i __a, int __count)
 Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in sign bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16 (__m256i __a, __m128i __count)
 Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32 (__m256i __a, int __count)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in sign bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32 (__m256i __a, __m128i __count)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16 (__m256i __a, int __count)
 Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16 (__m256i __a, __m128i __count)
 Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32 (__m256i __a, int __count)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32 (__m256i __a, __m128i __count)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64 (__m256i __a, int __count)
 Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64 (__m256i __a, __m128i __count)
 Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8 (__m256i __a, __m256i __b)
 Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16 (__m256i __a, __m256i __b)
 Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32 (__m256i __a, __m256i __b)
 Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64 (__m256i __a, __m256i __b)
 Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi8 (__m256i __a, __m256i __b)
 Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi16 (__m256i __a, __m256i __b)
 Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns each difference in the corresponding element of the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu8 (__m256i __a, __m256i __b)
 Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu16 (__m256i __a, __m256i __b)
 Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns each difference in the corresponding element of the [16 x i16] result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi8 (__m256i __a, __m256i __b)
 Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi16 (__m256i __a, __m256i __b)
 Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi32 (__m256i __a, __m256i __b)
 Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi64 (__m256i __a, __m256i __b)
 Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi8 (__m256i __a, __m256i __b)
 Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi16 (__m256i __a, __m256i __b)
 Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi32 (__m256i __a, __m256i __b)
 Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi64 (__m256i __a, __m256i __b)
 Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_xor_si256 (__m256i __a, __m256i __b)
 Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256 (const void *__V)
 Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vector.
 
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_broadcastss_ps (__m128 __X)
 Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 128-bit vector of [4 x float].
 
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_broadcastsd_pd (__m128d __a)
 Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __a to both elements of the result's 128-bit vector of [2 x double].
 
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcastss_ps (__m128 __X)
 Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 256-bit vector of [8 x float].
 
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_broadcastsd_pd (__m128d __X)
 Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __X to all elements of the result's 256-bit vector of [4 x double].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastsi128_si256 (__m128i __X)
 Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastb_epi8 (__m128i __X)
 Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastw_epi16 (__m128i __X)
 Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 256-bit vector of [16 x i16].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastd_epi32 (__m128i __X)
 Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastq_epi64 (__m128i __X)
 Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result's 256-bit vector of [4 x i64].
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastb_epi8 (__m128i __X)
 Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastw_epi16 (__m128i __X)
 Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 128-bit vector of [8 x i16].
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastd_epi32 (__m128i __X)
 Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32].
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastq_epi64 (__m128i __X)
 Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result's 128-bit vector of [2 x i64].
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32 (__m256i __a, __m256i __b)
 Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b.
 
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps (__m256 __a, __m256i __b)
 Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x float] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32 (int const *__X, __m256i __M)
 Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64 (long long const *__X, __m256i __M)
 Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32 (int const *__X, __m128i __M)
 Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64 (long long const *__X, __m128i __M)
 Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
 
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y)
 Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
 
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y)
 Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
 
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y)
 Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
 
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y)
 Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi32 (__m128i __X, __m128i __Y)
 Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
 Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi64 (__m128i __X, __m128i __Y)
 Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srav_epi32 (__m256i __X, __m256i __Y)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in sign bits, and returns the result.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi32 (__m128i __X, __m128i __Y)
 Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in sign bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
 Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi32 (__m128i __X, __m128i __Y)
 Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
 Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.
 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
 Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.
 

Macro Definition Documentation

◆ __DEFAULT_FN_ATTRS128

#define __DEFAULT_FN_ATTRS128
Value:
__attribute__((__always_inline__, __nodebug__, \
__target__("avx2,no-evex512"), __min_vector_width__(128)))
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.

Definition at line 21 of file avx2intrin.h.

◆ __DEFAULT_FN_ATTRS256

#define __DEFAULT_FN_ATTRS256
Value:
__attribute__((__always_inline__, __nodebug__, \
__target__("avx2,no-evex512"), __min_vector_width__(256)))

Definition at line 18 of file avx2intrin.h.

◆ _mm256_alignr_epi8

#define _mm256_alignr_epi8 (   a,
  b,
  n 
)
Value:
((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n)))
__device__ __2f16 b

Uses the lower half of the 256-bit vector a as the upper half of a temporary 256-bit value, and the lower half of the 256-bit vector b as the lower half of the temporary value.

Right-shifts the temporary value by n bytes, and uses the lower 16 bytes of the shifted value as the lower 16 bytes of the result. Uses the upper halves of a and b to make another temporary value, right shifts by n, and uses the lower 16 bytes of the shifted value as the upper 16 bytes of the result.

__m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
#define _mm256_alignr_epi8(a, b, n)
Uses the lower half of the 256-bit vector a as the upper half of a temporary 256-bit value,...
Definition: avx2intrin.h:438

This intrinsic corresponds to the VPALIGNR instruction.

Parameters
a: A 256-bit integer vector containing source values.
b: A 256-bit integer vector containing source values.
n: An immediate value specifying the number of bytes to shift.
Returns
A 256-bit integer vector containing the result.

Definition at line 438 of file avx2intrin.h.

◆ _mm256_blend_epi16

#define _mm256_blend_epi16 (   V1,
  V2,
  M 
)
Value:
((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M)))

Merges 16-bit integer values from either of the two 256-bit vectors V1 or V2, as specified by the immediate integer operand M, and returns the resulting 256-bit vector of [16 x i16].

FOR i := 0 TO 7
j := i*16
IF M[i] == 0
result[15+j:j] := V1[15+j:j]
result[143+j:128+j] := V1[143+j:128+j]
ELSE
result[15+j:j] := V2[15+j:j]
result[143+j:128+j] := V2[143+j:128+j]
FI
ENDFOR
__m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
#define _mm256_blend_epi16(V1, V2, M)
Merges 16-bit integer values from either of the two 256-bit vectors V1 or V2, as specified by the imm...
Definition: avx2intrin.h:603

This intrinsic corresponds to the VPBLENDW instruction.

Parameters
V1: A 256-bit vector of [16 x i16] containing source values.
V2: A 256-bit vector of [16 x i16] containing source values.
M: An immediate 8-bit integer operand, with bits [7:0] specifying the source for each element of the result. The position of the mask bit corresponds to the index of a copied value. When a mask bit is 0, the element is copied from V1; otherwise, it is copied from V2. M[0] determines the source for elements 0 and 8, M[1] for elements 1 and 9, and so forth.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 603 of file avx2intrin.h.

◆ _mm256_blend_epi32

#define _mm256_blend_epi32 (   V1,
  V2,
  M 
)
Value:
((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))

Merges 32-bit integer elements from either of the two 256-bit vectors of [8 x i32] in V1 or V2 to return a 256-bit vector of [8 x i32], as specified by the immediate integer operand M.

FOR i := 0 TO 7
j := i*32
IF M[i] == 0
result[31+j:j] := V1[31+j:j]
ELSE
result[31+j:j] := V2[31+j:j]
FI
ENDFOR
__m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
#define _mm256_blend_epi32(V1, V2, M)
Merges 32-bit integer elements from either of the two 256-bit vectors of [8 x i32] in V1 or V2 to ret...
Definition: avx2intrin.h:3148

This intrinsic corresponds to the VPBLENDD instruction.

Parameters
V1: A 256-bit vector of [8 x i32] containing source values.
V2: A 256-bit vector of [8 x i32] containing source values.
M: An immediate 8-bit integer operand, with bits [7:0] specifying the source for each element of the result. The position of the mask bit corresponds to the index of a copied value. When a mask bit is 0, the element is copied from V1; otherwise, it is copied from V2.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 3148 of file avx2intrin.h.

◆ _mm256_bslli_epi128

#define _mm256_bslli_epi128 (   a,
  imm 
)     ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))

Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result.

If imm is greater than 15, the returned result is all zeroes.

__m256i _mm256_bslli_epi128(__m256i a, const int imm);
#define _mm256_bslli_epi128(a, imm)
Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes,...
Definition: avx2intrin.h:2112

This intrinsic corresponds to the VPSLLDQ instruction.

Parameters
a: A 256-bit integer vector to be shifted.
imm: An unsigned immediate value specifying the shift count (in bytes).
Returns
A 256-bit integer vector containing the result.

Definition at line 2112 of file avx2intrin.h.

◆ _mm256_bsrli_epi128

#define _mm256_bsrli_epi128 (   a,
  imm 
)     ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))

Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result.

If imm is greater than 15, the returned result is all zeroes.

__m256i _mm256_bsrli_epi128(__m256i a, const int imm);
#define _mm256_bsrli_epi128(a, imm)
Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero byte...
Definition: avx2intrin.h:2356

This intrinsic corresponds to the VPSRLDQ instruction.

Parameters
a: A 256-bit integer vector to be shifted.
imm: An unsigned immediate value specifying the shift count (in bytes).
Returns
A 256-bit integer vector containing the result.

Definition at line 2356 of file avx2intrin.h.

◆ _mm256_extracti128_si256

#define _mm256_extracti128_si256 (   V,
  M 
)     ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))

Extracts half of the 256-bit vector V to the 128-bit result.

If bit 0 of the immediate M is zero, extracts the lower half of V; otherwise, extracts the upper half.

__m128i _mm256_extracti128_si256(__m256i V, const int M);
#define V(N, I)
Definition: ASTContext.h:3273
#define _mm256_extracti128_si256(V, M)
Extracts half of the 256-bit vector V to the 128-bit result.
Definition: avx2intrin.h:3459

This intrinsic corresponds to the VEXTRACTI128 instruction.

Parameters
V: A 256-bit integer vector containing the source values.
M: An immediate value specifying which half of V to extract.
Returns
A 128-bit integer vector containing the result.

Definition at line 3459 of file avx2intrin.h.

◆ _mm256_i32gather_epi32

#define _mm256_i32gather_epi32 (   m,
  i,
  s 
)
Value:
((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
(int const *)(m), (__v8si)(__m256i)(i), \
(__v8si)_mm256_set1_epi32(-1), (s)))
__device__ __2f16 float __ockl_bool s
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3666
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements se...
Definition: avxintrin.h:4254

Gathers eight 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.

FOR element := 0 to 7
j := element*32
k := element*32
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
#define _mm256_i32gather_epi32(m, i, s)
Gathers eight 32-bit integer values from memory m using scaled indexes from the 256-bit vector...
Definition: avx2intrin.h:5077

This intrinsic corresponds to the VPGATHERDD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [8 x i32] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [8 x i32] containing the gathered values.

Definition at line 5077 of file avx2intrin.h.

◆ _mm256_i32gather_epi64

#define _mm256_i32gather_epi64 (   m,
  i,
  s 
)
Value:
((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s)))
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4311

Gathers four 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

FOR element := 0 to 3
j := element*64
k := element*32
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
#define _mm256_i32gather_epi64(m, i, s)
Gathers four 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x...
Definition: avx2intrin.h:5209

This intrinsic corresponds to the VPGATHERDQ instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x i64] containing the gathered values.

Definition at line 5209 of file avx2intrin.h.

◆ _mm256_i32gather_pd

#define _mm256_i32gather_pd (   m,
  i,
  s 
)
Value:
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s)))
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3640
#define _mm256_cmp_pd(a, b, c)
Compares each of the corresponding double-precision values of two 256-bit vectors of [4 x double],...
Definition: avxintrin.h:1785
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to ze...
Definition: avxintrin.h:4326
#define _CMP_EQ_OQ
Definition: xmmintrin.h:3051

Gathers four 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

FOR element := 0 to 3
j := element*64
k := element*32
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
#define _mm256_i32gather_pd(m, i, s)
Gathers four 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector ...
Definition: avx2intrin.h:4795

This intrinsic corresponds to the VGATHERDPD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x double] containing the gathered values.

Definition at line 4795 of file avx2intrin.h.

◆ _mm256_i32gather_ps

#define _mm256_i32gather_ps (   m,
  i,
  s 
)
Value:
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
_mm256_setzero_ps(), \
_CMP_EQ_OQ), \
(s)))
#define _mm256_cmp_ps(a, b, c)
Compares each of the corresponding values of two 256-bit vectors of [8 x float], using the operation ...
Definition: avxintrin.h:1845
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition: avxintrin.h:4340
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3653

Gathers eight 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.

FOR element := 0 to 7
j := element*32
k := element*32
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
#define _mm256_i32gather_ps(m, i, s)
Gathers eight 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector...
Definition: avx2intrin.h:4937

This intrinsic corresponds to the VGATHERDPS instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [8 x i32] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [8 x float] containing the gathered values.

Definition at line 4937 of file avx2intrin.h.

◆ _mm256_i64gather_epi32

#define _mm256_i64gather_epi32 (   m,
  i,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4di)(__m256i)(i), \
(__v4si)_mm_set1_epi32(-1), (s)))
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3689

Gathers four 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

FOR element := 0 to 3
j := element*32
k := element*64
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
ENDFOR
__m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
#define _mm256_i64gather_epi32(m, i, s)
Gathers four 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x...
Definition: avx2intrin.h:5143

This intrinsic corresponds to the VPGATHERQD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x i32] containing the gathered values.

Definition at line 5143 of file avx2intrin.h.

◆ _mm256_i64gather_epi64

#define _mm256_i64gather_epi64 (   m,
  i,
  s 
)
Value:
((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s)))

Gathers four 64-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

FOR element := 0 to 3
j := element*64
k := element*64
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
ENDFOR
__m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
#define _mm256_i64gather_epi64(m, i, s)
Gathers four 64-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x...
Definition: avx2intrin.h:5275

This intrinsic corresponds to the VPGATHERQQ instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x i64] containing the gathered values.

Definition at line 5275 of file avx2intrin.h.

◆ _mm256_i64gather_pd

#define _mm256_i64gather_pd (   m,
  i,
  s 
)
Value:
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s)))

Gathers four 64-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

FOR element := 0 to 3
j := element*64
k := element*64
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
ENDFOR
__m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
#define _mm256_i64gather_pd(m, i, s)
Gathers four 64-bit floating-point values from memory m using scaled indexes from the 256-bit vector ...
Definition: avx2intrin.h:4866

This intrinsic corresponds to the VGATHERQPD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x double] containing the gathered values.

Definition at line 4866 of file avx2intrin.h.

◆ _mm256_i64gather_ps

#define _mm256_i64gather_ps (   m,
  i,
  s 
)
Value:
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s)))
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition: xmmintrin.h:525
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2018
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition: xmmintrin.h:1891

Gathers four 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

FOR element := 0 to 3
j := element*32
k := element*64
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
ENDFOR
__m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
#define _mm256_i64gather_ps(m, i, s)
Gathers four 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector ...
Definition: avx2intrin.h:5010

This intrinsic corresponds to the VGATHERQPS instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x float] containing the gathered values.

Definition at line 5010 of file avx2intrin.h.

◆ _mm256_inserti128_si256

#define _mm256_inserti128_si256 (   V1,
  V2,
  M 
)
Value:
((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M)))

Copies the 256-bit vector V1 to the result, then overwrites half of the result with the 128-bit vector V2.

If bit 0 of the immediate M is zero, overwrites the lower half of the result; otherwise, overwrites the upper half.

__m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
#define _mm256_inserti128_si256(V1, V2, M)
Copies the 256-bit vector V1 to the result, then overwrites half of the result with the 128-bit vecto...
Definition: avx2intrin.h:3482

This intrinsic corresponds to the VINSERTI128 instruction.

Parameters
V1: A 256-bit integer vector containing a source value.
V2: A 128-bit integer vector containing a source value.
M: An immediate value specifying where to put V2 in the result.
Returns
A 256-bit integer vector containing the result.

Definition at line 3482 of file avx2intrin.h.

◆ _mm256_mask_i32gather_epi32

#define _mm256_mask_i32gather_epi32 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
(int const *)(m), \
(__v8si)(__m256i)(i), \
(__v8si)(__m256i)(mask), (s)))

Conditionally gathers eight 32-bit integer values, either from the 256-bit vector of [8 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.

The 256-bit vector of [8 x i32] in mask determines the source for each element.

FOR element := 0 to 7
j := element*32
k := element*32
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
__m256i mask, const int s);
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)
Conditionally gathers eight 32-bit integer values, either from the 256-bit vector of [8 x i32] in a,...
Definition: avx2intrin.h:4434

This intrinsic corresponds to the VPGATHERDD instruction.

Parameters
a: A 256-bit vector of [8 x i32] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [8 x i32] containing signed indexes into m.
mask: A 256-bit vector of [8 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [8 x i32] containing the gathered values.

Definition at line 4434 of file avx2intrin.h.

◆ _mm256_mask_i32gather_epi64

#define _mm256_mask_i32gather_epi64 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)(__m256i)(mask), (s)))

Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

The 256-bit vector of [4 x i64] in mask determines the source for each element.

FOR element := 0 to 3
j := element*64
k := element*32
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
__m128i i, __m256i mask, const int s);
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)
Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a,...
Definition: avx2intrin.h:4630

This intrinsic corresponds to the VPGATHERDQ instruction.

Parameters
a: A 256-bit vector of [4 x i64] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
mask: A 256-bit vector of [4 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x i64] containing the gathered values.

Definition at line 4630 of file avx2intrin.h.

◆ _mm256_mask_i32gather_pd

#define _mm256_mask_i32gather_pd (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)(__m256d)(mask), (s)))

Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

The 256-bit vector of [4 x double] in mask determines the source for each element.

FOR element := 0 to 3
j := element*64
k := element*32
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
__m256d mask, const int s);
#define _mm256_mask_i32gather_pd(a, m, i, mask, s)
Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x doubl...
Definition: avx2intrin.h:4047

This intrinsic corresponds to the VGATHERDPD instruction.

Parameters
a: A 256-bit vector of [4 x double] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
mask: A 256-bit vector of [4 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x double] containing the gathered values.

Definition at line 4047 of file avx2intrin.h.

◆ _mm256_mask_i32gather_ps

#define _mm256_mask_i32gather_ps (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)(__m256)(mask), (s)))

Conditionally gathers eight 32-bit floating-point values, either from the 256-bit vector of [8 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.

The 256-bit vector of [8 x float] in mask determines the source for each element.

FOR element := 0 to 7
j := element*32
k := element*32
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
__m256 mask, const int s);
#define _mm256_mask_i32gather_ps(a, m, i, mask, s)
Conditionally gathers eight 32-bit floating-point values, either from the 256-bit vector of [8 x floa...
Definition: avx2intrin.h:4239

This intrinsic corresponds to the VGATHERDPS instruction.

Parameters
a: A 256-bit vector of [8 x float] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [8 x i32] containing signed indexes into m.
mask: A 256-bit vector of [8 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [8 x float] containing the gathered values.

Definition at line 4239 of file avx2intrin.h.

◆ _mm256_mask_i64gather_epi32

#define _mm256_mask_i64gather_epi32 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4di)(__m256i)(i), \
(__v4si)(__m128i)(mask), (s)))

Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

The 128-bit vector of [4 x i32] in mask determines the source for each element.

FOR element := 0 to 3
j := element*32
k := element*64
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
__m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
__m128i mask, const int s);
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)
Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a,...
Definition: avx2intrin.h:4533

This intrinsic corresponds to the VPGATHERQD instruction.

Parameters
a: A 128-bit vector of [4 x i32] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
mask: A 128-bit vector of [4 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x i32] containing the gathered values.

Definition at line 4533 of file avx2intrin.h.

◆ _mm256_mask_i64gather_epi64

#define _mm256_mask_i64gather_epi64 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)(__m256i)(mask), (s)))

Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

The 256-bit vector of [4 x i64] in mask determines the source for each element.

FOR element := 0 to 3
j := element*64
k := element*64
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
__m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
__m256i i, __m256i mask, const int s);
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s)
Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a,...
Definition: avx2intrin.h:4726

This intrinsic corresponds to the VPGATHERQQ instruction.

Parameters
a: A 256-bit vector of [4 x i64] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
mask: A 256-bit vector of [4 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x i64] containing the gathered values.

Definition at line 4726 of file avx2intrin.h.

◆ _mm256_mask_i64gather_pd

#define _mm256_mask_i64gather_pd (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)(__m256d)(mask), (s)))

Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

The 256-bit vector of [4 x double] in mask determines the source for each element.

FOR element := 0 to 3
j := element*64
k := element*64
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
__m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
__m256d mask, const int s);
#define _mm256_mask_i64gather_pd(a, m, i, mask, s)
Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x doubl...
Definition: avx2intrin.h:4143

This intrinsic corresponds to the VGATHERQPD instruction.

Parameters
a: A 256-bit vector of [4 x double] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
mask: A 256-bit vector of [4 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 256-bit vector of [4 x double] containing the gathered values.

Definition at line 4143 of file avx2intrin.h.

◆ _mm256_mask_i64gather_ps

#define _mm256_mask_i64gather_ps (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)(__m128)(mask), (s)))

Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.

The 128-bit vector of [4 x float] in mask determines the source for each element.

FOR element := 0 to 3
j := element*32
k := element*64
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
__m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
__m128 mask, const int s);
#define _mm256_mask_i64gather_ps(a, m, i, mask, s)
Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float...
Definition: avx2intrin.h:4338

This intrinsic corresponds to the VGATHERQPS instruction.

Parameters
a: A 128-bit vector of [4 x float] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 256-bit vector of [4 x i64] containing signed indexes into m.
mask: A 128-bit vector of [4 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x float] containing the gathered values.

Definition at line 4338 of file avx2intrin.h.

◆ _mm256_mpsadbw_epu8

#define _mm256_mpsadbw_epu8 (   X,
  Y,
  M 
)
Value:
((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M)))
#define X(type, name)
Definition: Value.h:143

Computes sixteen sum of absolute difference (SAD) operations on sets of four unsigned 8-bit integers from the 256-bit integer vectors X and Y.

Eight SAD results are computed using the lower half of the input vectors, and another eight using the upper half. These 16-bit values are returned in the lower and upper halves of the 256-bit result, respectively.

A single SAD operation selects four bytes from X and four bytes from Y as input. It computes the differences between each X byte and the corresponding Y byte, takes the absolute value of each difference, and sums these four values to form one 16-bit result. The intrinsic computes 16 of these results with different sets of input bytes.

For each set of eight results, the SAD operations use the same four bytes from Y; the starting bit position for these four bytes is specified by M[1:0] times 32 for the lower half and by M[4:3] times 32 for the upper half. The eight operations use successive sets of four bytes from X; the starting bit position for the first set of four bytes is specified by M[2] times 32 for the lower half and by M[5] times 32 for the upper half. These bit positions are all relative to the 128-bit lane for each set of eight operations.

r := 0
FOR i := 0 TO 1
j := i*3
Ybase := M[j+1:j]*32 + i*128
Xbase := M[j+2]*32 + i*128
FOR k := 0 TO 7
temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
result[r+15:r] := temp0 + temp1 + temp2 + temp3
Xbase := Xbase + 8
r := r + 16
ENDFOR
ENDFOR
__m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
#define _mm256_mpsadbw_epu8(X, Y, M)
Computes sixteen sum of absolute difference (SAD) operations on sets of four unsigned 8-bit integers ...
Definition: avx2intrin.h:83

This intrinsic corresponds to the VMPSADBW instruction.

Parameters
X: A 256-bit integer vector containing one of the inputs.
Y: A 256-bit integer vector containing one of the inputs.
M: An unsigned immediate value specifying the starting positions of the bytes to operate on.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 83 of file avx2intrin.h.

◆ _mm256_permute2x128_si256

#define _mm256_permute2x128_si256 (   V1,
  V2,
  M 
)     ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))

Sets each half of the 256-bit result either to zero or to one of the four possible 128-bit halves of the 256-bit vectors V1 and V2, as specified by the immediate value M.

FOR i := 0 TO 1
j := i*128
k := M >> (i*4)
IF k[3] == 0
CASE (k[1:0]) OF
0: result[127+j:j] := V1[127:0]
1: result[127+j:j] := V1[255:128]
2: result[127+j:j] := V2[127:0]
3: result[127+j:j] := V2[255:128]
ESAC
ELSE
result[127+j:j] := 0
FI
ENDFOR
__m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
#define _mm256_permute2x128_si256(V1, V2, M)
Sets each half of the 256-bit result either to zero or to one of the four possible 128-bit halves of ...
Definition: avx2intrin.h:3439

This intrinsic corresponds to the VPERM2I128 instruction.

Parameters
V1: A 256-bit integer vector containing source values.
V2: A 256-bit integer vector containing source values.
M: An immediate value specifying how to form the result. Bits [3:0] control the lower half of the result, bits [7:4] control the upper half. Within each 4-bit control value, if bit 3 is 1, the result is zero, otherwise bits [1:0] determine the source as follows.
0: the lower half of V1
1: the upper half of V1
2: the lower half of V2
3: the upper half of V2
Returns
A 256-bit integer vector containing the result.

Definition at line 3439 of file avx2intrin.h.

◆ _mm256_permute4x64_epi64

#define _mm256_permute4x64_epi64 (   V,
  M 
)     ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))

Sets the result's 256-bit vector of [4 x i64] to copies of elements of the 256-bit vector of [4 x i64] in V as specified by the immediate value M.

FOR i := 0 TO 3
j := i*64
k := (M >> i*2)[1:0] * 64
result[j+63:j] := V[k+63:k]
ENDFOR
__m256i _mm256_permute4x64_epi64(__m256i V, const int M);
#define _mm256_permute4x64_epi64(V, M)
Sets the result's 256-bit vector of [4 x i64] result to copies of elements of the 256-bit vector of [...
Definition: avx2intrin.h:3393

This intrinsic corresponds to the VPERMQ instruction.

Parameters
V: A 256-bit vector of [4 x i64] containing the source values.
M: An immediate 8-bit value specifying which elements to copy from V. M[1:0] specifies the index in V for element 0 of the result, M[3:2] specifies the index for element 1, and so forth.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 3393 of file avx2intrin.h.

◆ _mm256_permute4x64_pd

#define _mm256_permute4x64_pd (   V,
  M 
)     ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))

Sets the result's 256-bit vector of [4 x double] to copies of elements of the 256-bit vector of [4 x double] in V as specified by the immediate value M.

FOR i := 0 TO 3
j := i*64
k := (M >> i*2)[1:0] * 64
result[j+63:j] := V[k+63:k]
ENDFOR
__m256d _mm256_permute4x64_pd(__m256d V, const int M);
#define _mm256_permute4x64_pd(V, M)
Sets the result's 256-bit vector of [4 x double] to copies of elements of the 256-bit vector of [4 x ...
Definition: avx2intrin.h:3335

This intrinsic corresponds to the VPERMPD instruction.

Parameters
V: A 256-bit vector of [4 x double] containing the source values.
M: An immediate 8-bit value specifying which elements to copy from V. M[1:0] specifies the index in V for element 0 of the result, M[3:2] specifies the index for element 1, and so forth.
Returns
A 256-bit vector of [4 x double] containing the result.

Definition at line 3335 of file avx2intrin.h.

◆ _mm256_shuffle_epi32

#define _mm256_shuffle_epi32 (   a,
  imm 
)     ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in a according to control information in the integer literal imm, and returns the 256-bit result.

In effect there are two parallel 128-bit shuffles in the lower and upper halves.

FOR i := 0 to 3
j := i*32
k := (imm >> i*2)[1:0] * 32
result[j+31:j] := a[k+31:k]
result[128+j+31:128+j] := a[128+k+31:128+k]
ENDFOR
__m256i _mm256_shuffle_epi32(__m256i a, const int imm);
#define _mm256_shuffle_epi32(a, imm)
Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in a according to control information i...
Definition: avx2intrin.h:1936

This intrinsic corresponds to the VPSHUFD instruction.

Parameters
a: A 256-bit vector of [8 x i32] containing source values.
imm: An immediate 8-bit value specifying which elements to copy from a. imm[1:0] specifies the index in a for elements 0 and 4 of the result, imm[3:2] specifies the index for elements 1 and 5, and so forth.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 1936 of file avx2intrin.h.

◆ _mm256_shufflehi_epi16

#define _mm256_shufflehi_epi16 (   a,
  imm 
)     ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit result.

The upper 64 bits of each 128-bit half are shuffled in parallel; the lower 64 bits of each 128-bit half are copied from a unchanged.

result[63:0] := a[63:0]
result[191:128] := a[191:128]
FOR i := 0 TO 3
j := i * 16 + 64
k := (imm >> i*2)[1:0] * 16 + 64
result[j+15:j] := a[k+15:k]
result[128+j+15:128+j] := a[128+k+15:128+k]
ENDFOR
__m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
#define _mm256_shufflehi_epi16(a, imm)
Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information ...
Definition: avx2intrin.h:1972

This intrinsic corresponds to the VPSHUFHW instruction.

Parameters
a: A 256-bit vector of [16 x i16] containing source values.
imm: An immediate 8-bit value specifying which elements to copy from a. imm[1:0] specifies the index in a for elements 4 and 12 of the result, imm[3:2] specifies the index for elements 5 and 13, and so forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 1972 of file avx2intrin.h.

◆ _mm256_shufflelo_epi16

#define _mm256_shufflelo_epi16 (   a,
  imm 
)     ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))

Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit [16 x i16] result.

The lower 64 bits of each 128-bit half are shuffled; the upper 64 bits of each 128-bit half are copied from a unchanged.

result[127:64] := a[127:64]
result[255:192] := a[255:192]
FOR i := 0 TO 3
j := i * 16
k := (imm >> i*2)[1:0] * 16
result[j+15:j] := a[k+15:k]
result[128+j+15:128+j] := a[128+k+15:128+k]
ENDFOR
__m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
#define _mm256_shufflelo_epi16(a, imm)
Shuffles 16-bit integers from the 256-bit vector of [16 x i16] a according to control information in ...
Definition: avx2intrin.h:2009

This intrinsic corresponds to the VPSHUFLW instruction.

Parameters
a: A 256-bit vector of [16 x i16] to use as a source of data for the result.
imm: An immediate 8-bit value specifying which elements to copy from a. imm[1:0] specifies the index in a for elements 0 and 8 of the result, imm[3:2] specifies the index for elements 1 and 9, and so forth.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2009 of file avx2intrin.h.

◆ _mm256_slli_si256

#define _mm256_slli_si256 (   a,
  imm 
)     ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))

Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result.

If imm is greater than 15, the returned result is all zeroes.

__m256i _mm256_slli_si256(__m256i a, const int imm);
#define _mm256_slli_si256(a, imm)
Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes,...
Definition: avx2intrin.h:2092

This intrinsic corresponds to the VPSLLDQ instruction.

Parameters
a: A 256-bit integer vector to be shifted.
imm: An unsigned immediate value specifying the shift count (in bytes).
Returns
A 256-bit integer vector containing the result.

Definition at line 2092 of file avx2intrin.h.

◆ _mm256_srli_si256

#define _mm256_srli_si256 (   a,
  imm 
)     ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))

Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result.

If imm is greater than 15, the returned result is all zeroes.

__m256i _mm256_srli_si256(__m256i a, const int imm);
#define _mm256_srli_si256(a, imm)
Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero byte...
Definition: avx2intrin.h:2336

This intrinsic corresponds to the VPSRLDQ instruction.

Parameters
a: A 256-bit integer vector to be shifted.
imm: An unsigned immediate value specifying the shift count (in bytes).
Returns
A 256-bit integer vector containing the result.

Definition at line 2336 of file avx2intrin.h.

◆ _mm_blend_epi32

#define _mm_blend_epi32 (   V1,
  V2,
  M 
)
Value:
((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))

Merges 32-bit integer elements from either of the two 128-bit vectors of [4 x i32] in V1 or V2 to the result's 128-bit vector of [4 x i32], as specified by the immediate integer operand M.

FOR i := 0 TO 3
j := i*32
IF M[i] == 0
result[31+j:j] := V1[31+j:j]
ELSE
result[31+j:j] := V2[31+j:j]
FI
ENDFOR
__m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
#define _mm_blend_epi32(V1, V2, M)
Merges 32-bit integer elements from either of the two 128-bit vectors of [4 x i32] in V1 or V2 to the...
Definition: avx2intrin.h:3111

This intrinsic corresponds to the VPBLENDD instruction.

Parameters
V1: A 128-bit vector of [4 x i32] containing source values.
V2: A 128-bit vector of [4 x i32] containing source values.
M: An immediate 8-bit integer operand, with bits [3:0] specifying the source for each element of the result. The position of the mask bit corresponds to the index of a copied value. When a mask bit is 0, the element is copied from V1; otherwise, it is copied from V2.
Returns
A 128-bit vector of [4 x i32] containing the result.

Definition at line 3111 of file avx2intrin.h.

◆ _mm_broadcastsi128_si256

#define _mm_broadcastsi128_si256 (   X)    _mm256_broadcastsi128_si256(X)

Definition at line 3076 of file avx2intrin.h.

◆ _mm_i32gather_epi32

#define _mm_i32gather_epi32 (   m,
  i,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4si)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s)))

Gathers four 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

FOR element := 0 to 3
j := element*32
k := element*32
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
#define _mm_i32gather_epi32(m, i, s)
Gathers four 32-bit integer values from memory m using scaled indexes from the 128-bit vector ...
Definition: avx2intrin.h:5045

This intrinsic corresponds to the VPGATHERDD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x i32] containing the gathered values.

Definition at line 5045 of file avx2intrin.h.

◆ _mm_i32gather_epi64

#define _mm_i32gather_epi64 (   m,
  i,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s)))
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3655

Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

FOR element := 0 to 1
j := element*64
k := element*32
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
#define _mm_i32gather_epi64(m, i, s)
Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x ...
Definition: avx2intrin.h:5176

This intrinsic corresponds to the VPGATHERDQ instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x i64] containing the gathered values.

Definition at line 5176 of file avx2intrin.h.

◆ _mm_i32gather_pd

#define _mm_i32gather_pd (   m,
  i,
  s 
)
Value:
((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s)))
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1755
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1857
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:435

Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

FOR element := 0 to 1
j := element*64
k := element*32
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
#define _mm_i32gather_pd(m, i, s)
Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector o...
Definition: avx2intrin.h:4760

This intrinsic corresponds to the VGATHERDPD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x double] containing the gathered values.

Definition at line 4760 of file avx2intrin.h.

◆ _mm_i32gather_ps

#define _mm_i32gather_ps (   m,
  i,
  s 
)
Value:
((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s)))

Gathers four 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

FOR element := 0 to 3
j := element*32
k := element*32
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
ENDFOR
__m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
#define _mm_i32gather_ps(m, i, s)
Gathers four 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector ...
Definition: avx2intrin.h:4902

This intrinsic corresponds to the VGATHERDPS instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x float] containing the gathered values.

Definition at line 4902 of file avx2intrin.h.

◆ _mm_i64gather_epi32

#define _mm_i64gather_epi32 (   m,
  i,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v2di)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s)))

Gathers two 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

The upper two elements of the result are zeroed.

FOR element := 0 to 1
j := element*32
k := element*64
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
ENDFOR
result[127:64] := 0
__m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
#define _mm_i64gather_epi32(m, i, s)
Gathers two 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x ...
Definition: avx2intrin.h:5111

This intrinsic corresponds to the VPGATHERQD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x i32] containing the gathered values.

Definition at line 5111 of file avx2intrin.h.

◆ _mm_i64gather_epi64

#define _mm_i64gather_epi64 (   m,
  i,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s)))

Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

FOR element := 0 to 1
j := element*64
k := element*64
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
ENDFOR
__m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
#define _mm_i64gather_epi64(m, i, s)
Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x ...
Definition: avx2intrin.h:5242

This intrinsic corresponds to the VPGATHERQQ instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x i64] containing the gathered values.

Definition at line 5242 of file avx2intrin.h.

◆ _mm_i64gather_pd

#define _mm_i64gather_pd (   m,
  i,
  s 
)
Value:
((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s)))

Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

FOR element := 0 to 1
j := element*64
k := element*64
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
ENDFOR
__m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
#define _mm_i64gather_pd(m, i, s)
Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector o...
Definition: avx2intrin.h:4831

This intrinsic corresponds to the VGATHERQPD instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x double] containing the gathered values.

Definition at line 4831 of file avx2intrin.h.

◆ _mm_i64gather_ps

#define _mm_i64gather_ps (   m,
  i,
  s 
)
Value:
((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s)))

Gathers two 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

The upper two elements of the result are zeroed.

FOR element := 0 to 1
j := element*32
k := element*64
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
ENDFOR
result[127:64] := 0
__m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
#define _mm_i64gather_ps(m, i, s)
Gathers two 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector o...
Definition: avx2intrin.h:4975

This intrinsic corresponds to the VGATHERQPS instruction.

Parameters
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x float] containing the gathered values.

Definition at line 4975 of file avx2intrin.h.

◆ _mm_mask_i32gather_epi32

#define _mm_mask_i32gather_epi32 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4si)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s)))

Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

The 128-bit vector of [4 x i32] in mask determines the source for each element.

FOR element := 0 to 3
j := element*32
k := element*32
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
__m128i mask, const int s);
#define _mm_mask_i32gather_epi32(a, m, i, mask, s)
Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a,...
Definition: avx2intrin.h:4386

This intrinsic corresponds to the VPGATHERDD instruction.

Parameters
a: A 128-bit vector of [4 x i32] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
mask: A 128-bit vector of [4 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x i32] containing the gathered values.

Definition at line 4386 of file avx2intrin.h.

◆ _mm_mask_i32gather_epi64

#define _mm_mask_i32gather_epi64 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s)))

Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

The 128-bit vector of [2 x i64] in mask determines the source for each element.

FOR element := 0 to 1
j := element*64
k := element*32
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
__m128i mask, const int s);
#define _mm_mask_i32gather_epi64(a, m, i, mask, s)
Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a,...
Definition: avx2intrin.h:4582

This intrinsic corresponds to the VPGATHERDQ instruction.

Parameters
a: A 128-bit vector of [2 x i64] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used.
mask: A 128-bit vector of [2 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x i64] containing the gathered values.

Definition at line 4582 of file avx2intrin.h.

◆ _mm_mask_i32gather_pd

#define _mm_mask_i32gather_pd (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s)))

Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

The 128-bit vector of [2 x double] in mask determines the source for each element.

FOR element := 0 to 1
j := element*64
k := element*32
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
__m128d mask, const int s);
#define _mm_mask_i32gather_pd(a, m, i, mask, s)
Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double...
Definition: avx2intrin.h:3999

This intrinsic corresponds to the VGATHERDPD instruction.

Parameters
a: A 128-bit vector of [2 x double] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used.
mask: A 128-bit vector of [2 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x double] containing the gathered values.

Definition at line 3999 of file avx2intrin.h.

◆ _mm_mask_i32gather_ps

#define _mm_mask_i32gather_ps (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s)))

Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.

The 128-bit vector of [4 x float] in mask determines the source for each element.

FOR element := 0 to 3
j := element*32
k := element*32
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
FI
ENDFOR
__m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
__m128 mask, const int s);
#define _mm_mask_i32gather_ps(a, m, i, mask, s)
Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float...
Definition: avx2intrin.h:4191

This intrinsic corresponds to the VGATHERDPS instruction.

Parameters
a: A 128-bit vector of [4 x float] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [4 x i32] containing signed indexes into m.
mask: A 128-bit vector of [4 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x float] containing the gathered values.

Definition at line 4191 of file avx2intrin.h.

◆ _mm_mask_i64gather_epi32

#define _mm_mask_i64gather_epi32 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v2di)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s)))

Conditionally gathers two 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

The 128-bit vector of [4 x i32] in mask determines the source for the lower two elements. The upper two elements of the result are zeroed.

FOR element := 0 to 1
j := element*32
k := element*64
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
result[127:64] := 0
__m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
__m128i mask, const int s);
#define _mm_mask_i64gather_epi32(a, m, i, mask, s)
Conditionally gathers two 32-bit integer values, either from the 128-bit vector of [4 x i32] in a,...
Definition: avx2intrin.h:4485

This intrinsic corresponds to the VPGATHERQD instruction.

Parameters
a: A 128-bit vector of [4 x i32] used as the source when a mask bit is zero. Only the first two elements are used.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
mask: A 128-bit vector of [4 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. Only the first two elements are used.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x i32] containing the gathered values.

Definition at line 4485 of file avx2intrin.h.

◆ _mm_mask_i64gather_epi64

#define _mm_mask_i64gather_epi64 (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s)))

Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

The 128-bit vector of [2 x i64] in mask determines the source for each element.

FOR element := 0 to 1
j := element*64
k := element*64
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
__m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
__m128i mask, const int s);
#define _mm_mask_i64gather_epi64(a, m, i, mask, s)
Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a,...
Definition: avx2intrin.h:4678

This intrinsic corresponds to the VPGATHERQQ instruction.

Parameters
a: A 128-bit vector of [2 x i64] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
mask: A 128-bit vector of [2 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x i64] containing the gathered values.

Definition at line 4678 of file avx2intrin.h.

◆ _mm_mask_i64gather_pd

#define _mm_mask_i64gather_pd (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s)))

Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

The 128-bit vector of [2 x double] in mask determines the source for each element.

FOR element := 0 to 1
j := element*64
k := element*64
IF mask[j+63] == 0
result[j+63:j] := a[j+63:j]
ELSE
result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
__m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
__m128d mask, const int s);
#define _mm_mask_i64gather_pd(a, m, i, mask, s)
Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double...
Definition: avx2intrin.h:4095

This intrinsic corresponds to the VGATHERQPD instruction.

Parameters
a: A 128-bit vector of [2 x double] used as the source when a mask bit is zero.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
mask: A 128-bit vector of [2 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [2 x double] containing the gathered values.

Definition at line 4095 of file avx2intrin.h.

◆ _mm_mask_i64gather_ps

#define _mm_mask_i64gather_ps (   a,
  m,
  i,
  mask,
  s 
)
Value:
((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s)))

Conditionally gathers two 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.

The 128-bit vector of [4 x float] in mask determines the source for the lower two elements. The upper two elements of the result are zeroed.

FOR element := 0 to 1
j := element*32
k := element*64
IF mask[j+31] == 0
result[j+31:j] := a[j+31:j]
ELSE
result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
FI
ENDFOR
result[127:64] := 0
__m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
__m128 mask, const int s);
#define _mm_mask_i64gather_ps(a, m, i, mask, s)
Conditionally gathers two 32-bit floating-point values, either from the 128-bit vector of [4 x float]...
Definition: avx2intrin.h:4290

This intrinsic corresponds to the VGATHERQPS instruction.

Parameters
a: A 128-bit vector of [4 x float] used as the source when a mask bit is zero. Only the first two elements are used.
m: A pointer to the memory used for loading values.
i: A 128-bit vector of [2 x i64] containing signed indexes into m.
mask: A 128-bit vector of [4 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. Only the first two elements are used.
s: A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8.
Returns
A 128-bit vector of [4 x float] containing the gathered values.

Definition at line 4290 of file avx2intrin.h.

Function Documentation

◆ _mm256_abs_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16 ( __m256i  __a)
static

Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a and returns each value in the corresponding element of the result.

This intrinsic corresponds to the VPABSW instruction.

Parameters
__a: A 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 116 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_abs_epi16(), and _mm256_maskz_abs_epi16().

◆ _mm256_abs_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32 ( __m256i  __a)
static

Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a and returns each value in the corresponding element of the result.

This intrinsic corresponds to the VPABSD instruction.

Parameters
__a: A 256-bit vector of [8 x i32].
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 133 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_abs_epi32(), and _mm256_maskz_abs_epi32().

◆ _mm256_abs_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8 ( __m256i  __a)
static

Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each value in the corresponding byte of the result.

This intrinsic corresponds to the VPABSB instruction.

Parameters
__a: A 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 99 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_abs_epi8(), and _mm256_maskz_abs_epi8().

◆ _mm256_add_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in the corresponding element of the [16 x i16] result (overflow is ignored).

This intrinsic corresponds to the VPADDW instruction.

Parameters
__a: A 256-bit vector of [16 x i16] containing one of the source operands.
__b: A 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the sums.

Definition at line 297 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_add_epi16(), and _mm256_maskz_add_epi16().

◆ _mm256_add_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in the corresponding element of the [8 x i32] result (overflow is ignored).

This intrinsic corresponds to the VPADDD instruction.

Parameters
__a: A 256-bit vector of [8 x i32] containing one of the source operands.
__b: A 256-bit vector of [8 x i32] containing one of the source operands.
Returns
A 256-bit vector of [8 x i32] containing the sums.

Definition at line 316 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_add_epi32(), and _mm256_maskz_add_epi32().

◆ _mm256_add_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64 ( __m256i  __a,
__m256i  __b 
)
static

Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the lower 64 bits of each sum in the corresponding element of the [4 x i64] result (overflow is ignored).

This intrinsic corresponds to the VPADDQ instruction.

Parameters
__a: A 256-bit vector of [4 x i64] containing one of the source operands.
__b: A 256-bit vector of [4 x i64] containing one of the source operands.
Returns
A 256-bit vector of [4 x i64] containing the sums.

Definition at line 335 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_add_epi64(), and _mm256_maskz_add_epi64().

◆ _mm256_add_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 bits of each sum in the corresponding byte of the 256-bit integer vector result (overflow is ignored).

This intrinsic corresponds to the VPADDB instruction.

Parameters
__a: A 256-bit integer vector containing one of the source operands.
__b: A 256-bit integer vector containing one of the source operands.
Returns
A 256-bit integer vector containing the sums.

Definition at line 278 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_add_epi8(), and _mm256_maskz_add_epi8().

◆ _mm256_adds_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns the [16 x i16] result.

This intrinsic corresponds to the VPADDSW instruction.

Parameters
__a: A 256-bit vector of [16 x i16] containing one of the source operands.
__b: A 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the sums.

Definition at line 372 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_adds_epi16(), and _mm256_maskz_adds_epi16().

◆ _mm256_adds_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result.

This intrinsic corresponds to the VPADDSB instruction.

Parameters
__a: A 256-bit integer vector containing one of the source operands.
__b: A 256-bit integer vector containing one of the source operands.
Returns
A 256-bit integer vector containing the sums.

Definition at line 354 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_adds_epi8(), and _mm256_maskz_adds_epi8().

◆ _mm256_adds_epu16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu16 ( __m256i  __a,
__m256i  __b 
)
static

Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns the [16 x i16] result.

This intrinsic corresponds to the VPADDUSW instruction.

Parameters
__a: A 256-bit vector of [16 x i16] containing one of the source operands.
__b: A 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the sums.

Definition at line 409 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_adds_epu16(), and _mm256_maskz_adds_epu16().

◆ _mm256_adds_epu8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu8 ( __m256i  __a,
__m256i  __b 
)
static

Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result.

This intrinsic corresponds to the VPADDUSB instruction.

Parameters
__a: A 256-bit integer vector containing one of the source operands.
__b: A 256-bit integer vector containing one of the source operands.
Returns
A 256-bit integer vector containing the sums.

Definition at line 391 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_adds_epu8(), and _mm256_maskz_adds_epu8().

◆ _mm256_and_si256()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256 ( __m256i  __a,
__m256i  __b 
)
static

Computes the bitwise AND of the 256-bit integer vectors in __a and __b.

This intrinsic corresponds to the VPAND instruction.

Parameters
__a: A 256-bit integer vector.
__b: A 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 455 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_test_epi16_mask(), _mm256_mask_test_epi32_mask(), _mm256_mask_test_epi64_mask(), _mm256_mask_test_epi8_mask(), _mm256_mask_testn_epi16_mask(), _mm256_mask_testn_epi32_mask(), _mm256_mask_testn_epi64_mask(), _mm256_mask_testn_epi8_mask(), _mm256_test_epi16_mask(), _mm256_test_epi32_mask(), _mm256_test_epi64_mask(), _mm256_test_epi8_mask(), _mm256_testn_epi16_mask(), _mm256_testn_epi32_mask(), _mm256_testn_epi64_mask(), and _mm256_testn_epi8_mask().

◆ _mm256_andnot_si256()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_andnot_si256 ( __m256i  __a,
__m256i  __b 
)
static

Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit integer vector in __a.

This intrinsic corresponds to the VPANDN instruction.

Parameters
__a: A 256-bit integer vector.
__b: A 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 473 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_avg_epu16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16 ( __m256i  __a,
__m256i  __b 
)
static

Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result.

FOR i := 0 TO 15
j := i*16
result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
ENDFOR

This intrinsic corresponds to the VPAVGW instruction.

Parameters
__a: A 256-bit vector of [16 x i16].
__b: A 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 525 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_avg_epu16(), and _mm256_maskz_avg_epu16().

◆ _mm256_avg_epu8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8 ( __m256i  __a,
__m256i  __b 
)
static

Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result.

FOR i := 0 TO 31
j := i*8
result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
ENDFOR

This intrinsic corresponds to the VPAVGB instruction.

Parameters
__a: A 256-bit integer vector.
__b: A 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 499 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_avg_epu8(), and _mm256_maskz_avg_epu8().

◆ _mm256_blendv_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_blendv_epi8 ( __m256i  __V1,
__m256i  __V2,
__m256i  __M 
)
static

Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the 256-bit mask __M and returns the resulting 256-bit integer vector.

FOR i := 0 TO 31
j := i*8
IF __M[7+j] == 0
result[7+j:j] := __V1[7+j:j]
ELSE
result[7+j:j] := __V2[7+j:j]
FI
ENDFOR

This intrinsic corresponds to the VPBLENDVB instruction.

Parameters
__V1A 256-bit integer vector containing source values.
__V2A 256-bit integer vector containing source values.
__MA 256-bit integer vector, with bit [7] of each byte specifying the source for each corresponding byte of the result. When the mask bit is 0, the byte is copied from __V1; otherwise, it is copied from __V2.
Returns
A 256-bit integer vector containing the result.

Definition at line 560 of file avx2intrin.h.

◆ _mm256_broadcastb_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastb_epi8 ( __m128i  __X)
static

Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.

This intrinsic corresponds to the VPBROADCASTB instruction.

Parameters
__XA 128-bit integer vector whose low byte will be broadcast.
Returns
A 256-bit integer vector containing the result.

Definition at line 3163 of file avx2intrin.h.

Referenced by _mm256_mask_broadcastb_epi8(), and _mm256_maskz_broadcastb_epi8().

◆ _mm256_broadcastd_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastd_epi32 ( __m128i  __X)
static

Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].

This intrinsic corresponds to the VPBROADCASTD instruction.

Parameters
__XA 128-bit vector of [4 x i32] whose low element will be broadcast.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 3195 of file avx2intrin.h.

Referenced by _mm256_mask_broadcastd_epi32(), and _mm256_maskz_broadcastd_epi32().

◆ _mm256_broadcastq_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastq_epi64 ( __m128i  __X)
static

Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result's 256-bit vector of [4 x i64].

This intrinsic corresponds to the VPBROADCASTQ instruction.

Parameters
__XA 128-bit vector of [2 x i64] whose low element will be broadcast.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 3211 of file avx2intrin.h.

Referenced by _mm256_mask_broadcastq_epi64(), and _mm256_maskz_broadcastq_epi64().

◆ _mm256_broadcastsd_pd()

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_broadcastsd_pd ( __m128d  __X)
static

Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __X to all elements of the result's 256-bit vector of [4 x double].

This intrinsic corresponds to the VBROADCASTSD instruction.

Parameters
__XA 128-bit vector of [2 x double] whose low element will be broadcast.
Returns
A 256-bit vector of [4 x double] containing the result.

Definition at line 3055 of file avx2intrin.h.

Referenced by _mm256_mask_broadcastsd_pd(), and _mm256_maskz_broadcastsd_pd().

◆ _mm256_broadcastsi128_si256()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastsi128_si256 ( __m128i  __X)
static

Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result.

This intrinsic corresponds to the VBROADCASTI128 instruction.

Parameters
__XA 128-bit integer vector to be broadcast.
Returns
A 256-bit integer vector containing the result.

Definition at line 3071 of file avx2intrin.h.

◆ _mm256_broadcastss_ps()

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcastss_ps ( __m128  __X)
static

Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 256-bit vector of [8 x float].

This intrinsic corresponds to the VBROADCASTSS instruction.

Parameters
__XA 128-bit vector of [4 x float] whose low element will be broadcast.
Returns
A 256-bit vector of [8 x float] containing the result.

Definition at line 3038 of file avx2intrin.h.

Referenced by _mm256_mask_broadcastss_ps(), and _mm256_maskz_broadcastss_ps().

◆ _mm256_broadcastw_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastw_epi16 ( __m128i  __X)
static

Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 256-bit vector of [16 x i16].

This intrinsic corresponds to the VPBROADCASTW instruction.

Parameters
__XA 128-bit vector of [8 x i16] whose low element will be broadcast.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 3179 of file avx2intrin.h.

Referenced by _mm256_mask_broadcastw_epi16(), and _mm256_maskz_broadcastw_epi16().

◆ _mm256_cmpeq_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.

FOR i := 0 TO 15
j := i*16
result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPEQW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the inputs.
__bA 256-bit vector of [16 x i16] containing one of the inputs.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 654 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpeq_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.

FOR i := 0 TO 7
j := i*32
result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPEQD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the inputs.
__bA 256-bit vector of [8 x i32] containing one of the inputs.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 680 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpeq_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi64 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.

FOR i := 0 TO 3
j := i*64
result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPEQQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] containing one of the inputs.
__bA 256-bit vector of [4 x i64] containing one of the inputs.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 706 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpeq_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns the outcomes in the corresponding bytes of the 256-bit result.

FOR i := 0 TO 31
j := i*8
result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPEQB instruction.

Parameters
__aA 256-bit integer vector containing one of the inputs.
__bA 256-bit integer vector containing one of the inputs.
Returns
A 256-bit integer vector containing the result.

Definition at line 628 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpgt_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.

FOR i := 0 TO 15
j := i*16
result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPGTW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the inputs.
__bA 256-bit vector of [16 x i16] containing one of the inputs.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 760 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpgt_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.

FOR i := 0 TO 7
j := i*32
result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPGTD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the inputs.
__bA 256-bit vector of [8 x i32] containing one of the inputs.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 786 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpgt_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi64 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.

FOR i := 0 TO 3
j := i*64
result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPGTQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] containing one of the inputs.
__bA 256-bit vector of [4 x i64] containing one of the inputs.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 812 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cmpgt_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than and returns the outcomes in the corresponding bytes of the 256-bit result.

FOR i := 0 TO 31
j := i*8
result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
ENDFOR

This intrinsic corresponds to the VPCMPGTB instruction.

Parameters
__aA 256-bit integer vector containing one of the inputs.
__bA 256-bit integer vector containing one of the inputs.
Returns
A 256-bit integer vector containing the result.

Definition at line 732 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_cvtepi16_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32 ( __m128i  __V)
static

Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].

FOR i := 0 TO 7
j := i*16
k := i*32
result[k+31:k] := SignExtend(__V[j+15:j])
ENDFOR

This intrinsic corresponds to the VPMOVSXWD instruction.

Parameters
__VA 128-bit vector of [8 x i16] containing the source values.
Returns
A 256-bit vector of [8 x i32] containing the sign-extended values.

Definition at line 1441 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepi16_epi32(), and _mm256_maskz_cvtepi16_epi32().

◆ _mm256_cvtepi16_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi64 ( __m128i  __V)
static

Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].

result[63:0] := SignExtend(__V[15:0])
result[127:64] := SignExtend(__V[31:16])
result[191:128] := SignExtend(__V[47:32])
result[255:192] := SignExtend(__V[63:48])

This intrinsic corresponds to the VPMOVSXWQ instruction.

Parameters
__VA 128-bit vector of [8 x i16] containing the source values.
Returns
A 256-bit vector of [4 x i64] containing the sign-extended values.

Definition at line 1466 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepi16_epi64(), and _mm256_maskz_cvtepi16_epi64().

◆ _mm256_cvtepi32_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi64 ( __m128i  __V)
static

Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].

result[63:0] := SignExtend(__V[31:0])
result[127:64] := SignExtend(__V[63:32])
result[191:128] := SignExtend(__V[95:64])
result[255:192] := SignExtend(__V[127:96])

This intrinsic corresponds to the VPMOVSXDQ instruction.

Parameters
__VA 128-bit vector of [4 x i32] containing the source values.
Returns
A 256-bit vector of [4 x i64] containing the sign-extended values.

Definition at line 1491 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepi32_epi64(), and _mm256_maskz_cvtepi32_epi64().

◆ _mm256_cvtepi8_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi16 ( __m128i  __V)
static

Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16].

FOR i := 0 TO 15
j := i*8
k := i*16
result[k+15:k] := SignExtend(__V[j+7:j])
ENDFOR

This intrinsic corresponds to the VPMOVSXBW instruction.

Parameters
__VA 128-bit integer vector containing the source bytes.
Returns
A 256-bit vector of [16 x i16] containing the sign-extended values.

Definition at line 1358 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepi8_epi16(), and _mm256_maskz_cvtepi8_epi16().

◆ _mm256_cvtepi8_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi32 ( __m128i  __V)
static

Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].

FOR i := 0 TO 7
j := i*8
k := i*32
result[k+31:k] := SignExtend(__V[j+7:j])
ENDFOR

This intrinsic corresponds to the VPMOVSXBD instruction.

Parameters
__VA 128-bit integer vector containing the source bytes.
Returns
A 256-bit vector of [8 x i32] containing the sign-extended values.

Definition at line 1386 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepi8_epi32(), and _mm256_maskz_cvtepi8_epi32().

◆ _mm256_cvtepi8_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi64 ( __m128i  __V)
static

Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].

result[63:0] := SignExtend(__V[7:0])
result[127:64] := SignExtend(__V[15:8])
result[191:128] := SignExtend(__V[23:16])
result[255:192] := SignExtend(__V[31:24])

This intrinsic corresponds to the VPMOVSXBQ instruction.

Parameters
__VA 128-bit integer vector containing the source bytes.
Returns
A 256-bit vector of [4 x i64] containing the sign-extended values.

Definition at line 1413 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepi8_epi64(), and _mm256_maskz_cvtepi8_epi64().

◆ _mm256_cvtepu16_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi32 ( __m128i  __V)
static

Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].

FOR i := 0 TO 7
j := i*16
k := i*32
result[k+31:k] := ZeroExtend(__V[j+15:j])
ENDFOR

This intrinsic corresponds to the VPMOVZXWD instruction.

Parameters
__VA 128-bit vector of [8 x i16] containing the source values.
Returns
A 256-bit vector of [8 x i32] containing the zero-extended values.

Definition at line 1594 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepu16_epi32(), and _mm256_maskz_cvtepu16_epi32().

◆ _mm256_cvtepu16_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi64 ( __m128i  __V)
static

Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].

result[63:0] := ZeroExtend(__V[15:0])
result[127:64] := ZeroExtend(__V[31:16])
result[191:128] := ZeroExtend(__V[47:32])
result[255:192] := ZeroExtend(__V[63:48])

This intrinsic corresponds to the VPMOVZXWQ instruction.

Parameters
__VA 128-bit vector of [8 x i16] containing the source values.
Returns
A 256-bit vector of [4 x i64] containing the zero-extended values.

Definition at line 1619 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepu16_epi64(), and _mm256_maskz_cvtepu16_epi64().

◆ _mm256_cvtepu32_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_epi64 ( __m128i  __V)
static

Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].

result[63:0] := ZeroExtend(__V[31:0])
result[127:64] := ZeroExtend(__V[63:32])
result[191:128] := ZeroExtend(__V[95:64])
result[255:192] := ZeroExtend(__V[127:96])

This intrinsic corresponds to the VPMOVZXDQ instruction.

Parameters
__VA 128-bit vector of [4 x i32] containing the source values.
Returns
A 256-bit vector of [4 x i64] containing the zero-extended values.

Definition at line 1644 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepu32_epi64(), and _mm256_maskz_cvtepu32_epi64().

◆ _mm256_cvtepu8_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi16 ( __m128i  __V)
static

Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16].

FOR i := 0 TO 15
j := i*8
k := i*16
result[k+15:k] := ZeroExtend(__V[j+7:j])
ENDFOR

This intrinsic corresponds to the VPMOVZXBW instruction.

Parameters
__VA 128-bit integer vector containing the source bytes.
Returns
A 256-bit vector of [16 x i16] containing the zero-extended values.

Definition at line 1517 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepu8_epi16(), and _mm256_maskz_cvtepu8_epi16().

◆ _mm256_cvtepu8_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi32 ( __m128i  __V)
static

Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].

FOR i := 0 TO 7
j := i*8
k := i*32
result[k+31:k] := ZeroExtend(__V[j+7:j])
ENDFOR

This intrinsic corresponds to the VPMOVZXBD instruction.

Parameters
__VA 128-bit integer vector containing the source bytes.
Returns
A 256-bit vector of [8 x i32] containing the zero-extended values.

Definition at line 1543 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepu8_epi32(), and _mm256_maskz_cvtepu8_epi32().

◆ _mm256_cvtepu8_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi64 ( __m128i  __V)
static

Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].

result[63:0] := ZeroExtend(__V[7:0])
result[127:64] := ZeroExtend(__V[15:8])
result[191:128] := ZeroExtend(__V[23:16])
result[255:192] := ZeroExtend(__V[31:24])

This intrinsic corresponds to the VPMOVZXBQ instruction.

Parameters
__VA 128-bit integer vector containing the source bytes.
Returns
A 256-bit vector of [4 x i64] containing the zero-extended values.

Definition at line 1568 of file avx2intrin.h.

Referenced by _mm256_mask_cvtepu8_epi64(), and _mm256_maskz_cvtepu8_epi64().

◆ _mm256_hadd_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in an element of the [16 x i16] result (overflow is ignored).

Sums from __a are returned in the lower 64 bits of each 128-bit half of the result; sums from __b are returned in the upper 64 bits of each 128-bit half of the result.

FOR i := 0 TO 1
j := i*128
result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
ENDFOR

This intrinsic corresponds to the VPHADDW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the sums.

Definition at line 848 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_hadd_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in an element of the [8 x i32] result (overflow is ignored).

Sums from __a are returned in the lower 64 bits of each 128-bit half of the result; sums from __b are returned in the upper 64 bits of each 128-bit half of the result.

FOR i := 0 TO 1
j := i*128
result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
ENDFOR

This intrinsic corresponds to the VPHADDD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the source operands.
__bA 256-bit vector of [8 x i32] containing one of the source operands.
Returns
A 256-bit vector of [8 x i32] containing the sums.

Definition at line 880 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_hadds_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each sum in an element of the [16 x i16] result.

Sums from __a are returned in the lower 64 bits of each 128-bit half of the result; sums from __b are returned in the upper 64 bits of each 128-bit half of the result.

FOR i := 0 TO 1
j := i*128
result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
ENDFOR

This intrinsic corresponds to the VPHADDSW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the sums.

Definition at line 915 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_hsub_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each difference in an element of the [16 x i16] result (overflow is ignored).

Differences from __a are returned in the lower 64 bits of each 128-bit half of the result; differences from __b are returned in the upper 64 bits of each 128-bit half of the result.

FOR i := 0 TO 1
j := i*128
result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
ENDFOR

This intrinsic corresponds to the VPHSUBW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the differences.

Definition at line 951 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_hsub_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each difference in an element of the [8 x i32] result (overflow is ignored).

Differences from __a are returned in the lower 64 bits of each 128-bit half of the result; differences from __b are returned in the upper 64 bits of each 128-bit half of the result.

FOR i := 0 TO 1
j := i*128
result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
ENDFOR

This intrinsic corresponds to the VPHSUBD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the source operands.
__bA 256-bit vector of [8 x i32] containing one of the source operands.
Returns
A 256-bit vector of [8 x i32] containing the differences.

Definition at line 983 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_hsubs_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each difference in an element of the [16 x i16] result.

Differences from __a are returned in the lower 64 bits of each 128-bit half of the result; differences from __b are returned in the upper 64 bits of each 128-bit half of the result.

FOR i := 0 TO 1
j := i*128
result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
ENDFOR

This intrinsic corresponds to the VPHSUBSW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the differences.

Definition at line 1019 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_madd_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit intermediate products, and adds pairs of those products to form 32-bit sums returned as elements of the [8 x i32] result.

There is only one wraparound case: when all four of the 16-bit sources are 0x8000, the result will be 0x80000000.

FOR i := 0 TO 7
j := i*32
temp1 := __a[j+15:j] * __b[j+15:j]
temp2 := __a[j+31:j+16] * __b[j+31:j+16]
result[j+31:j] := temp1 + temp2
ENDFOR

This intrinsic corresponds to the VPMADDWD instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 1081 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_madd_epi16(), and _mm256_maskz_madd_epi16().

◆ _mm256_maddubs_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed byte from the 256-bit integer vector in __b, forming signed 16-bit intermediate products.

Adds adjacent pairs of those products using signed saturation to form 16-bit sums returned as elements of the [16 x i16] result.

FOR i := 0 TO 15
j := i*16
temp1 := __a[j+7:j] * __b[j+7:j]
temp2 := __a[j+15:j+8] * __b[j+15:j+8]
result[j+15:j] := SATURATE16(temp1 + temp2)
ENDFOR

This intrinsic corresponds to the VPMADDUBSW instruction.

Parameters
__aA 256-bit vector containing one of the source operands.
__bA 256-bit vector containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 1049 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_maddubs_epi16(), and _mm256_maskz_maddubs_epi16().

◆ _mm256_maskload_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32 ( int const *  __X,
__m256i  __M 
)
static

Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.

Returns the 256-bit [8 x i32] result.

FOR i := 0 TO 7
j := i*32
IF __M[j+31] == 1
result[j+31:j] := Load32(__X+(i*4))
ELSE
result[j+31:j] := 0
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVD instruction.

Parameters
__XA pointer to the memory used for loading values.
__MA 256-bit vector of [8 x i32] containing the mask bits.
Returns
A 256-bit vector of [8 x i32] containing the loaded or zeroed elements.

Definition at line 3513 of file avx2intrin.h.

◆ _mm256_maskload_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64 ( long long const *  __X,
__m256i  __M 
)
static

Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.

Returns the 256-bit [4 x i64] result.

FOR i := 0 TO 3
j := i*64
IF __M[j+63] == 1
result[j+63:j] := Load64(__X+(i*8))
ELSE
result[j+63:j] := 0
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVQ instruction.

Parameters
__XA pointer to the memory used for loading values.
__MA 256-bit vector of [4 x i64] containing the mask bits.
Returns
A 256-bit vector of [4 x i64] containing the loaded or zeroed elements.

Definition at line 3545 of file avx2intrin.h.

◆ _mm256_maskstore_epi32()

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32 ( int *  __X,
__m256i  __M,
__m256i  __Y 
)
static

Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.

FOR i := 0 TO 7
j := i*32
IF __M[j+31] == 1
Store32(__X+(i*4), __Y[j+31:j])
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVD instruction.

Parameters
__XA pointer to the memory used for storing values.
__MA 256-bit vector of [8 x i32] containing the mask bits.
__YA 256-bit vector of [8 x i32] containing the values to store.

Definition at line 3639 of file avx2intrin.h.

References __Y.

◆ _mm256_maskstore_epi64()

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64 ( long long *  __X,
__m256i  __M,
__m256i  __Y 
)
static

Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.

FOR i := 0 TO 3
j := i*64
IF __M[j+63] == 1
Store64(__X+(i*8), __Y[j+63:j])
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVQ instruction.

Parameters
__XA pointer to the memory used for storing values.
__MA 256-bit vector of [4 x i64] containing the mask bits.
__YA 256-bit vector of [4 x i64] containing the values to store.

Definition at line 3669 of file avx2intrin.h.

References __Y.

◆ _mm256_max_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMAXSW instruction.

Parameters
__aA 256-bit vector of [16 x i16].
__bA 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 1119 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_max_epi16(), and _mm256_maskz_max_epi16().

◆ _mm256_max_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMAXSD instruction.

Parameters
__aA 256-bit vector of [8 x i32].
__bA 256-bit vector of [8 x i32].
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 1138 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_max_epi32(), and _mm256_maskz_max_epi32().

◆ _mm256_max_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.

This intrinsic corresponds to the VPMAXSB instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 1100 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_max_epi8(), and _mm256_maskz_max_epi8().

◆ _mm256_max_epu16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMAXUW instruction.

Parameters
__aA 256-bit vector of [16 x i16].
__bA 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 1176 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_max_epu16(), and _mm256_maskz_max_epu16().

◆ _mm256_max_epu32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMAXUD instruction.

Parameters
__aA 256-bit vector of [8 x i32].
__bA 256-bit vector of [8 x i32].
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 1195 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_max_epu32(), and _mm256_maskz_max_epu32().

◆ _mm256_max_epu8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.

This intrinsic corresponds to the VPMAXUB instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 1157 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_max_epu8(), and _mm256_maskz_max_epu8().

◆ _mm256_min_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMINSW instruction.

Parameters
__aA 256-bit vector of [16 x i16].
__bA 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 1233 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_min_epi16(), and _mm256_maskz_min_epi16().

◆ _mm256_min_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMINSD instruction.

Parameters
__aA 256-bit vector of [8 x i32].
__bA 256-bit vector of [8 x i32].
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 1252 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_min_epi32(), and _mm256_maskz_min_epi32().

◆ _mm256_min_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.

This intrinsic corresponds to the VPMINSB instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 1214 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_min_epi8(), and _mm256_maskz_min_epi8().

◆ _mm256_min_epu16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMINUW instruction.

Parameters
__aA 256-bit vector of [16 x i16].
__bA 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 1290 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_min_epu16(), and _mm256_maskz_min_epu16().

◆ _mm256_min_epu32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.

This intrinsic corresponds to the VPMINUD instruction.

Parameters
__aA 256-bit vector of [8 x i32].
__bA 256-bit vector of [8 x i32].
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 1309 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_min_epu32(), and _mm256_maskz_min_epu32().

◆ _mm256_min_epu8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8 ( __m256i  __a,
__m256i  __b 
)
static

Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.

This intrinsic corresponds to the VPMINUB instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 1271 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_min_epu8(), and _mm256_maskz_min_epu8().

◆ _mm256_movemask_epi8()

static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8 ( __m256i  __a)
static

Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vector in __a and returns the result.

FOR i := 0 TO 31
j := i*8
result[i] := __a[j+7]
ENDFOR

This intrinsic corresponds to the VPMOVMSKB instruction.

Parameters
__aA 256-bit integer vector containing the source bytes.
Returns
The 32-bit integer mask.

Definition at line 1332 of file avx2intrin.h.

References __a.

◆ _mm256_mul_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result.

result[63:0] := __a[31:0] * __b[31:0]
result[127:64] := __a[95:64] * __b[95:64]
result[191:128] := __a[159:128] * __b[159:128]
result[255:192] := __a[223:192] * __b[223:192]

This intrinsic corresponds to the VPMULDQ instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the source operands.
__bA 256-bit vector of [8 x i32] containing one of the source operands.
Returns
A 256-bit vector of [4 x i64] containing the products.

Definition at line 1670 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mul_epi32(), and _mm256_maskz_mul_epi32().

◆ _mm256_mul_epu32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epu32 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result.

result[63:0] := __a[31:0] * __b[31:0]
result[127:64] := __a[95:64] * __b[95:64]
result[191:128] := __a[159:128] * __b[159:128]
result[255:192] := __a[223:192] * __b[223:192]

This intrinsic corresponds to the VPMULUDQ instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the source operands.
__bA 256-bit vector of [8 x i32] containing one of the source operands.
Returns
A 256-bit vector of [4 x i64] containing the products.

Definition at line 1799 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mul_epu32(), and _mm256_maskz_mul_epu32().

◆ _mm256_mulhi_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result.

This intrinsic corresponds to the VPMULHW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the products.

Definition at line 1735 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mulhi_epi16(), and _mm256_maskz_mulhi_epi16().

◆ _mm256_mulhi_epu16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epu16 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result.

This intrinsic corresponds to the VPMULHUW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the products.

Definition at line 1716 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mulhi_epu16(), and _mm256_maskz_mulhi_epu16().

◆ _mm256_mulhrs_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit results to the most significant 18 bits, rounds by adding 1, and returns bits [16:1] of each rounded product in the [16 x i16] result.

FOR i := 0 TO 15
j := i*16
temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
result[j+15:j] := temp[16:1]
ENDFOR

This intrinsic corresponds to the VPMULHRSW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the rounded products.

Definition at line 1697 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mulhrs_epi16(), and _mm256_maskz_mulhrs_epi16().

◆ _mm256_mullo_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower 16 bits of each 32-bit product in the [16 x i16] result.

This intrinsic corresponds to the VPMULLW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing one of the source operands.
__bA 256-bit vector of [16 x i16] containing one of the source operands.
Returns
A 256-bit vector of [16 x i16] containing the products.

Definition at line 1754 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mullo_epi16(), and _mm256_maskz_mullo_epi16().

◆ _mm256_mullo_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower 32 bits of each 64-bit product in the [8 x i32] result.

This intrinsic corresponds to the VPMULLD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing one of the source operands.
__bA 256-bit vector of [8 x i32] containing one of the source operands.
Returns
A 256-bit vector of [8 x i32] containing the products.

Definition at line 1773 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_mullo_epi32(), and _mm256_maskz_mullo_epi32().

◆ _mm256_or_si256()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_or_si256 ( __m256i  __a,
__m256i  __b 
)
static

Computes the bitwise OR of the 256-bit integer vectors in __a and __b.

This intrinsic corresponds to the VPOR instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 1817 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_packs_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation, and returns the 256-bit result.

FOR i := 0 TO 7
j := i*16
k := i*8
result[7+k:k] := SATURATE8(__a[15+j:j])
result[71+k:64+k] := SATURATE8(__b[15+j:j])
result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
ENDFOR

This intrinsic corresponds to the VPACKSSWB instruction.

Parameters
__aA 256-bit vector of [16 x i16] used to generate result[63:0] and result[191:128].
__bA 256-bit vector of [16 x i16] used to generate result[127:64] and result[255:192].
Returns
A 256-bit integer vector containing the result.

Definition at line 164 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_packs_epi16(), and _mm256_maskz_packs_epi16().

◆ _mm256_packs_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation, and returns the resulting 256-bit vector of [16 x i16].

FOR i := 0 TO 3
j := i*32
k := i*16
result[15+k:k] := SATURATE16(__a[31+j:j])
result[79+k:64+k] := SATURATE16(__b[31+j:j])
result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
ENDFOR

This intrinsic corresponds to the VPACKSSDW instruction.

Parameters
__aA 256-bit vector of [8 x i32] used to generate result[63:0] and result[191:128].
__bA 256-bit vector of [8 x i32] used to generate result[127:64] and result[255:192].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 196 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_packs_epi32(), and _mm256_maskz_packs_epi32().

◆ _mm256_packus_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation, and returns the 256-bit result.

FOR i := 0 TO 7
j := i*16
k := i*8
result[7+k:k] := SATURATE8U(__a[15+j:j])
result[71+k:64+k] := SATURATE8U(__b[15+j:j])
result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
ENDFOR

This intrinsic corresponds to the VPACKUSWB instruction.

Parameters
__aA 256-bit vector of [16 x i16] used to generate result[63:0] and result[191:128].
__bA 256-bit vector of [16 x i16] used to generate result[127:64] and result[255:192].
Returns
A 256-bit integer vector containing the result.

Definition at line 227 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_packus_epi16(), and _mm256_maskz_packus_epi16().

◆ _mm256_packus_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32 ( __m256i  __V1,
__m256i  __V2 
)
static

Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation, and returns the resulting 256-bit vector of [16 x i16].

FOR i := 0 TO 3
j := i*32
k := i*16
result[15+k:k] := SATURATE16U(__V1[31+j:j])
result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
ENDFOR

This intrinsic corresponds to the VPACKUSDW instruction.

Parameters
__V1A 256-bit vector of [8 x i32] used to generate result[63:0] and result[191:128].
__V2A 256-bit vector of [8 x i32] used to generate result[127:64] and result[255:192].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 259 of file avx2intrin.h.

Referenced by _mm256_mask_packus_epi32(), and _mm256_maskz_packus_epi32().

◆ _mm256_permutevar8x32_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b.

FOR i := 0 TO 7
j := i*32
k := __b[j+2:j] * 32
result[j+31:j] := __a[k+31:k]
ENDFOR

This intrinsic corresponds to the VPERMD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing the source values.
__bA 256-bit vector of [8 x i32] containing indexes of values to use from __a.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 3303 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_permutevar8x32_ps()

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps ( __m256  __a,
__m256i  __b 
)
static

Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x float] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b.

FOR i := 0 TO 7
j := i*32
k := __b[j+2:j] * 32
result[j+31:j] := __a[k+31:k]
ENDFOR

This intrinsic corresponds to the VPERMPS instruction.

Parameters
__aA 256-bit vector of [8 x float] containing the source values.
__bA 256-bit vector of [8 x i32] containing indexes of values to use from __a.
Returns
A 256-bit vector of [8 x float] containing the result.

Definition at line 3361 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_sad_epu8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8 ( __m256i  __a,
__m256i  __b 
)
static

Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b.

One SAD result is computed for each set of eight bytes from __a and eight bytes from __b. The zero-extended SAD value is returned in the corresponding 64-bit element of the result.

A single SAD operation takes the differences between the corresponding bytes of __a and __b, takes the absolute value of each difference, and sums these eight values to form one 16-bit result. This operation is repeated four times with successive sets of eight bytes.

FOR i := 0 TO 3
j := i*64
temp0 := ABS(__a[j+7:j] - __b[j+7:j])
temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
temp4 + temp5 + temp6 + temp7
result[j+63:j+16] := 0
ENDFOR

This intrinsic corresponds to the VPSADBW instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 1862 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_shuffle_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256-bit integer vector __b, and returns the 256-bit result.

In effect there are two separate 128-bit shuffles in the lower and upper halves.

FOR i := 0 TO 31
j := i*8
IF __b[j+7] == 1
result[j+7:j] := 0
ELSE
k := __b[j+3:j] * 8
IF i > 15
k := k + 128
FI
result[j+7:j] := __a[k+7:k]
FI
ENDFOR

This intrinsic corresponds to the VPSHUFB instruction.

Parameters
__aA 256-bit integer vector containing source values.
__bA 256-bit integer vector containing control information to determine what goes into the corresponding byte of the result. If bit 7 of the control byte is 1, the result byte is 0; otherwise, bits 3:0 of the control byte specify the index (within the same 128-bit half) of __a to copy to the result byte.
Returns
A 256-bit integer vector containing the result.

Definition at line 1901 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_shuffle_epi8(), and _mm256_maskz_shuffle_epi8().

◆ _mm256_sign_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [16 x i16] in __b is greater than zero, less than zero, or equal to zero, respectively.

This intrinsic corresponds to the VPSIGNW instruction.

Parameters
__aA 256-bit vector of [16 x i16].
__bA 256-bit vector of [16 x i16].
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2049 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_sign_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [8 x i32] in __b is greater than zero, less than zero, or equal to zero, respectively.

This intrinsic corresponds to the VPSIGND instruction.

Parameters
__aA 256-bit vector of [8 x i32].
__bA 256-bit vector of [8 x i32].
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2070 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_sign_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a, the negative of that byte, or zero, depending on whether the corresponding byte of the 256-bit integer vector in __b is greater than zero, less than zero, or equal to zero, respectively.

This intrinsic corresponds to the VPSIGNB instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 2028 of file avx2intrin.h.

References __a, and __b.

◆ _mm256_sll_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits specified by the lower 64 bits of __count, shifting in zero bits, and returns the result.

If __count is greater than 15, the returned result is all zeroes.

This intrinsic corresponds to the VPSLLW instruction.

Parameters
__aA 256-bit vector of [16 x i16] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2150 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_sll_epi16(), and _mm256_maskz_sll_epi16().

◆ _mm256_sll_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.

If __count is greater than 31, the returned result is all zeroes.

This intrinsic corresponds to the VPSLLD instruction.

Parameters
__aA 256-bit vector of [8 x i32] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2190 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_sll_epi32(), and _mm256_maskz_sll_epi32().

◆ _mm256_sll_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.

If __count is greater than 63, the returned result is all zeroes.

This intrinsic corresponds to the VPSLLQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 2230 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_sll_epi64(), and _mm256_maskz_sll_epi64().

◆ _mm256_slli_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16 ( __m256i  __a,
int  __count 
)
static

Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result.

If __count is greater than 15, the returned result is all zeroes.

This intrinsic corresponds to the VPSLLW instruction.

Parameters
__aA 256-bit vector of [16 x i16] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2129 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_slli_epi16(), and _mm256_maskz_slli_epi16().

◆ _mm256_slli_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32 ( __m256i  __a,
int  __count 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits, shifting in zero bits, and returns the result.

If __count is greater than 31, the returned result is all zeroes.

This intrinsic corresponds to the VPSLLD instruction.

Parameters
__aA 256-bit vector of [8 x i32] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2169 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_slli_epi32(), and _mm256_maskz_slli_epi32().

◆ _mm256_slli_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64 ( __m256i  __a,
int  __count 
)
static

Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits, shifting in zero bits, and returns the result.

If __count is greater than 63, the returned result is all zeroes.

This intrinsic corresponds to the VPSLLQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 2209 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_slli_epi64(), and _mm256_maskz_slli_epi64().

◆ _mm256_sllv_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi32 ( __m256i  __X,
__m256i  __Y 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 31, the result for that element is zero.

This intrinsic corresponds to the VPSLLVD instruction.

Parameters
__XA 256-bit vector of [8 x i32] to be shifted.
__YA 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits).
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 3751 of file avx2intrin.h.

References __Y.

Referenced by _mm256_mask_sllv_epi32(), and _mm256_maskz_sllv_epi32().

◆ _mm256_sllv_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi64 ( __m256i  __X,
__m256i  __Y 
)
static

Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 63, the result for that element is zero.

This intrinsic corresponds to the VPSLLVQ instruction.

Parameters
__XA 256-bit vector of [4 x i64] to be shifted.
__YA 256-bit vector of [4 x i64] containing the unsigned shift counts (in bits).
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 3795 of file avx2intrin.h.

References __Y.

Referenced by _mm256_mask_sllv_epi64(), and _mm256_maskz_sllv_epi64().

◆ _mm256_sra_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.

If __count is greater than 15, each element of the result is either 0 or -1 according to the corresponding input sign bit.

This intrinsic corresponds to the VPSRAW instruction.

Parameters
__aA 256-bit vector of [16 x i16] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2272 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_sra_epi16(), and _mm256_maskz_sra_epi16().

◆ _mm256_sra_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.

If __count is greater than 31, each element of the result is either 0 or -1 according to the corresponding input sign bit.

This intrinsic corresponds to the VPSRAD instruction.

Parameters
__aA 256-bit vector of [8 x i32] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2314 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_sra_epi32(), and _mm256_maskz_sra_epi32().

◆ _mm256_srai_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16 ( __m256i  __a,
int  __count 
)
static

Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in sign bits, and returns the result.

If __count is greater than 15, each element of the result is either 0 or -1 according to the corresponding input sign bit.

This intrinsic corresponds to the VPSRAW instruction.

Parameters
__aA 256-bit vector of [16 x i16] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2250 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srai_epi16(), and _mm256_maskz_srai_epi16().

◆ _mm256_srai_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32 ( __m256i  __a,
int  __count 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in sign bits, and returns the result.

If __count is greater than 31, each element of the result is either 0 or -1 according to the corresponding input sign bit.

This intrinsic corresponds to the VPSRAD instruction.

Parameters
__aA 256-bit vector of [8 x i32] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2292 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srai_epi32(), and _mm256_maskz_srai_epi32().

◆ _mm256_srav_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srav_epi32 ( __m256i  __X,
__m256i  __Y 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in sign bits, and returns the result.

If the shift count for any element is greater than 31, the result for that element is 0 or -1 according to the sign bit for that element.

This intrinsic corresponds to the VPSRAVD instruction.

Parameters
__XA 256-bit vector of [8 x i32] to be shifted.
__YA 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits).
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 3840 of file avx2intrin.h.

References __Y.

Referenced by _mm256_mask_srav_epi32(), and _mm256_maskz_srav_epi32().

◆ _mm256_srl_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.

If __count is greater than 15, the returned result is all zeroes.

This intrinsic corresponds to the VPSRLW instruction.

Parameters
__aA 256-bit vector of [16 x i16] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2394 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srl_epi16(), and _mm256_maskz_srl_epi16().

◆ _mm256_srl_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.

If __count is greater than 31, the returned result is all zeroes.

This intrinsic corresponds to the VPSRLD instruction.

Parameters
__aA 256-bit vector of [8 x i32] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2434 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srl_epi32(), and _mm256_maskz_srl_epi32().

◆ _mm256_srl_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64 ( __m256i  __a,
__m128i  __count 
)
static

Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.

If __count is greater than 63, the returned result is all zeroes.

This intrinsic corresponds to the VPSRLQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] to be shifted.
__countA 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 2474 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srl_epi64(), and _mm256_maskz_srl_epi64().

◆ _mm256_srli_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16 ( __m256i  __a,
int  __count 
)
static

Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in zero bits, and returns the result.

If __count is greater than 15, the returned result is all zeroes.

This intrinsic corresponds to the VPSRLW instruction.

Parameters
__aA 256-bit vector of [16 x i16] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2373 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srli_epi16(), and _mm256_maskz_srli_epi16().

◆ _mm256_srli_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32 ( __m256i  __a,
int  __count 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in zero bits, and returns the result.

If __count is greater than 31, the returned result is all zeroes.

This intrinsic corresponds to the VPSRLD instruction.

Parameters
__aA 256-bit vector of [8 x i32] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2413 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srli_epi32(), and _mm256_maskz_srli_epi32().

◆ _mm256_srli_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64 ( __m256i  __a,
int  __count 
)
static

Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits, shifting in zero bits, and returns the result.

If __count is greater than 63, the returned result is all zeroes.

This intrinsic corresponds to the VPSRLQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] to be shifted.
__countAn unsigned integer value specifying the shift count (in bits).
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 2453 of file avx2intrin.h.

References __a.

Referenced by _mm256_mask_srli_epi64(), and _mm256_maskz_srli_epi64().

◆ _mm256_srlv_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi32 ( __m256i  __X,
__m256i  __Y 
)
static

Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 31, the result for that element is zero.

This intrinsic corresponds to the VPSRLVD instruction.

Parameters
__XA 256-bit vector of [8 x i32] to be shifted.
__YA 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits).
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 3885 of file avx2intrin.h.

References __Y.

Referenced by _mm256_mask_srlv_epi32(), and _mm256_maskz_srlv_epi32().

◆ _mm256_srlv_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi64 ( __m256i  __X,
__m256i  __Y 
)
static

Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 63, the result for that element is zero.

This intrinsic corresponds to the VPSRLVQ instruction.

Parameters
__XA 256-bit vector of [4 x i64] to be shifted.
__YA 256-bit vector of [4 x i64] containing the unsigned shift counts (in bits).
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 3929 of file avx2intrin.h.

References __Y.

Referenced by _mm256_mask_srlv_epi64(), and _mm256_maskz_srlv_epi64().

◆ _mm256_stream_load_si256()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256 ( const void *  __V)
static

Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vector.

__V must be aligned on a 32-byte boundary.

This intrinsic corresponds to the VMOVNTDQA instruction.

Parameters
__VA pointer to the 32-byte aligned memory containing the vector to load.
Returns
A 256-bit integer vector loaded from memory.

Definition at line 2986 of file avx2intrin.h.

◆ _mm256_sub_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].

Returns the lower 16 bits of each difference in the corresponding element of the [16 x i16] result (overflow is ignored).

FOR i := 0 TO 15
j := i*16
result[j+15:j] := __a[j+15:j] - __b[j+15:j]
ENDFOR

This intrinsic corresponds to the VPSUBW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing the minuends.
__bA 256-bit vector of [16 x i16] containing the subtrahends.
Returns
A 256-bit vector of [16 x i16] containing the differences.

Definition at line 2528 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_sub_epi16(), and _mm256_maskz_sub_epi16().

◆ _mm256_sub_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].

Returns the lower 32 bits of each difference in the corresponding element of the [8 x i32] result (overflow is ignored).

FOR i := 0 TO 7
j := i*32
result[j+31:j] := __a[j+31:j] - __b[j+31:j]
ENDFOR

This intrinsic corresponds to the VPSUBD instruction.

Parameters
__aA 256-bit vector of [8 x i32] containing the minuends.
__bA 256-bit vector of [8 x i32] containing the subtrahends.
Returns
A 256-bit vector of [8 x i32] containing the differences.

Definition at line 2554 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_sub_epi32(), and _mm256_maskz_sub_epi32().

◆ _mm256_sub_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].

Returns the lower 64 bits of each difference in the corresponding element of the [4 x i64] result (overflow is ignored).

FOR i := 0 TO 3
j := i*64
result[j+63:j] := __a[j+63:j] - __b[j+63:j]
ENDFOR

This intrinsic corresponds to the VPSUBQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] containing the minuends.
__bA 256-bit vector of [4 x i64] containing the subtrahends.
Returns
A 256-bit vector of [4 x i64] containing the differences.

Definition at line 2580 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_sub_epi64(), and _mm256_maskz_sub_epi64().

◆ _mm256_sub_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.

Returns the lower 8 bits of each difference in the corresponding byte of the 256-bit integer vector result (overflow is ignored).

FOR i := 0 TO 31
j := i*8
result[j+7:j] := __a[j+7:j] - __b[j+7:j]
ENDFOR

This intrinsic corresponds to the VPSUBB instruction.

Parameters
__aA 256-bit integer vector containing the minuends.
__bA 256-bit integer vector containing the subtrahends.
Returns
A 256-bit integer vector containing the differences.

Definition at line 2501 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_sub_epi8(), and _mm256_maskz_sub_epi8().

◆ _mm256_subs_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns each difference in the corresponding element of the [16 x i16] result.

FOR i := 0 TO 15
j := i*16
result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
ENDFOR

This intrinsic corresponds to the VPSUBSW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing the minuends.
__bA 256-bit vector of [16 x i16] containing the subtrahends.
Returns
A 256-bit vector of [16 x i16] containing the differences.

Definition at line 2632 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_subs_epi16(), and _mm256_maskz_subs_epi16().

◆ _mm256_subs_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result.

FOR i := 0 TO 31
j := i*8
result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
ENDFOR

This intrinsic corresponds to the VPSUBSB instruction.

Parameters
__aA 256-bit integer vector containing the minuends.
__bA 256-bit integer vector containing the subtrahends.
Returns
A 256-bit integer vector containing the differences.

Definition at line 2606 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_subs_epi8(), and _mm256_maskz_subs_epi8().

◆ _mm256_subs_epu16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu16 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns each difference in the corresponding element of the [16 x i16] result.

FOR i := 0 TO 15
j := i*16
result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
ENDFOR

This intrinsic corresponds to the VPSUBUSW instruction.

Parameters
__aA 256-bit vector of [16 x i16] containing the minuends.
__bA 256-bit vector of [16 x i16] containing the subtrahends.
Returns
A 256-bit vector of [16 x i16] containing the differences.

Definition at line 2685 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_subs_epu16(), and _mm256_maskz_subs_epu16().

◆ _mm256_subs_epu8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu8 ( __m256i  __a,
__m256i  __b 
)
static

Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result.

For each byte, computes result = __a - __b.

FOR i := 0 TO 31
j := i*8
result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
ENDFOR

This intrinsic corresponds to the VPSUBUSB instruction.

Parameters
__aA 256-bit integer vector containing the minuends.
__bA 256-bit integer vector containing the subtrahends.
Returns
A 256-bit integer vector containing the differences.

Definition at line 2659 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_subs_epu8(), and _mm256_maskz_subs_epu8().

◆ _mm256_unpackhi_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].

Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[15:0] := __a[79:64]
result[31:16] := __b[79:64]
result[47:32] := __a[95:80]
result[63:48] := __b[95:80]
. . .
result[127:112] := __b[127:112]
result[143:128] := __a[207:192]
. . .
result[255:240] := __b[255:240]

This intrinsic corresponds to the VPUNPCKHWD instruction.

Parameters
__aA 256-bit vector of [16 x i16] used as the source for the even-numbered elements of the result.
__bA 256-bit vector of [16 x i16] used as the source for the odd-numbered elements of the result.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2754 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpackhi_epi16(), and _mm256_maskz_unpackhi_epi16().

◆ _mm256_unpackhi_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32].

Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[31:0] := __a[95:64]
result[63:32] := __b[95:64]
result[95:64] := __a[127:96]
result[127:96] := __b[127:96]
result[159:128] := __a[223:192]
result[191:160] := __b[223:192]
result[223:192] := __a[255:224]
result[255:224] := __b[255:224]

This intrinsic corresponds to the VPUNPCKHDQ instruction.

Parameters
__aA 256-bit vector of [8 x i32] used as the source for the even-numbered elements of the result.
__bA 256-bit vector of [8 x i32] used as the source for the odd-numbered elements of the result.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2788 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpackhi_epi32(), and _mm256_maskz_unpackhi_epi32().

◆ _mm256_unpackhi_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi64 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64].

Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[63:0] := __a[127:64]
result[127:64] := __b[127:64]
result[191:128] := __a[255:192]
result[255:192] := __b[255:192]

This intrinsic corresponds to the VPUNPCKHQDQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] used as the source for the even-numbered elements of the result.
__bA 256-bit vector of [4 x i64] used as the source for the odd-numbered elements of the result.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 2818 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpackhi_epi64(), and _mm256_maskz_unpackhi_epi64().

◆ _mm256_unpackhi_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result.

Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[7:0] := __a[71:64]
result[15:8] := __b[71:64]
result[23:16] := __a[79:72]
result[31:24] := __b[79:72]
. . .
result[127:120] := __b[127:120]
result[135:128] := __a[199:192]
. . .
result[255:248] := __b[255:248]

This intrinsic corresponds to the VPUNPCKHBW instruction.

Parameters
__aA 256-bit integer vector used as the source for the even-numbered bytes of the result.
__bA 256-bit integer vector used as the source for the odd-numbered bytes of the result.
Returns
A 256-bit integer vector containing the result.

Definition at line 2719 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpackhi_epi8(), and _mm256_maskz_unpackhi_epi8().

◆ _mm256_unpacklo_epi16()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi16 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].

Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[15:0] := __a[15:0]
result[31:16] := __b[15:0]
result[47:32] := __a[31:16]
result[63:48] := __b[31:16]
. . .
result[127:112] := __b[63:48]
result[143:128] := __a[143:128]
. . .
result[255:240] := __b[191:176]

This intrinsic corresponds to the VPUNPCKLWD instruction.

Parameters
__aA 256-bit vector of [16 x i16] used as the source for the even-numbered elements of the result.
__bA 256-bit vector of [16 x i16] used as the source for the odd-numbered elements of the result.
Returns
A 256-bit vector of [16 x i16] containing the result.

Definition at line 2887 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpacklo_epi16(), and _mm256_maskz_unpacklo_epi16().

◆ _mm256_unpacklo_epi32()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi32 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32].

Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[31:0] := __a[31:0]
result[63:32] := __b[31:0]
result[95:64] := __a[63:32]
result[127:96] := __b[63:32]
result[159:128] := __a[159:128]
result[191:160] := __b[159:128]
result[223:192] := __a[191:160]
result[255:224] := __b[191:160]

This intrinsic corresponds to the VPUNPCKLDQ instruction.

Parameters
__aA 256-bit vector of [8 x i32] used as the source for the even-numbered elements of the result.
__bA 256-bit vector of [8 x i32] used as the source for the odd-numbered elements of the result.
Returns
A 256-bit vector of [8 x i32] containing the result.

Definition at line 2921 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpacklo_epi32(), and _mm256_maskz_unpacklo_epi32().

◆ _mm256_unpacklo_epi64()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi64 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64].

Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[63:0] := __a[63:0]
result[127:64] := __b[63:0]
result[191:128] := __a[191:128]
result[255:192] := __b[191:128]

This intrinsic corresponds to the VPUNPCKLQDQ instruction.

Parameters
__aA 256-bit vector of [4 x i64] used as the source for the even-numbered elements of the result.
__bA 256-bit vector of [4 x i64] used as the source for the odd-numbered elements of the result.
Returns
A 256-bit vector of [4 x i64] containing the result.

Definition at line 2951 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpacklo_epi64(), and _mm256_maskz_unpacklo_epi64().

◆ _mm256_unpacklo_epi8()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi8 ( __m256i  __a,
__m256i  __b 
)
static

Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result.

Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.

result[7:0] := __a[7:0]
result[15:8] := __b[7:0]
result[23:16] := __a[15:8]
result[31:24] := __b[15:8]
. . .
result[127:120] := __b[63:56]
result[135:128] := __a[135:128]
. . .
result[255:248] := __b[191:184]

This intrinsic corresponds to the VPUNPCKLBW instruction.

Parameters
__aA 256-bit integer vector used as the source for the even-numbered bytes of the result.
__bA 256-bit integer vector used as the source for the odd-numbered bytes of the result.
Returns
A 256-bit integer vector containing the result.

Definition at line 2852 of file avx2intrin.h.

References __a, and __b.

Referenced by _mm256_mask_unpacklo_epi8(), and _mm256_maskz_unpacklo_epi8().

◆ _mm256_xor_si256()

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_xor_si256 ( __m256i  __a,
__m256i  __b 
)
static

Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.

This intrinsic corresponds to the VPXOR instruction.

Parameters
__aA 256-bit integer vector.
__bA 256-bit integer vector.
Returns
A 256-bit integer vector containing the result.

Definition at line 2969 of file avx2intrin.h.

References __a, and __b.

◆ _mm_broadcastb_epi8()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastb_epi8 ( __m128i  __X)
static

Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.

This intrinsic corresponds to the VPBROADCASTB instruction.

Parameters
__XA 128-bit integer vector whose low byte will be broadcast.
Returns
A 128-bit integer vector containing the result.

Definition at line 3227 of file avx2intrin.h.

Referenced by _mm_mask_broadcastb_epi8(), and _mm_maskz_broadcastb_epi8().

◆ _mm_broadcastd_epi32()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastd_epi32 ( __m128i  __X)
static

Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32].

This intrinsic corresponds to the VPBROADCASTD instruction.

Parameters
__XA 128-bit vector of [4 x i32] whose low element will be broadcast.
Returns
A 128-bit vector of [4 x i32] containing the result.

Definition at line 3259 of file avx2intrin.h.

Referenced by _mm_mask_broadcastd_epi32(), and _mm_maskz_broadcastd_epi32().

◆ _mm_broadcastq_epi64()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastq_epi64 ( __m128i  __X)
static

Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result's 128-bit vector of [2 x i64].

This intrinsic corresponds to the VPBROADCASTQ instruction.

Parameters
__XA 128-bit vector of [2 x i64] whose low element will be broadcast.
Returns
A 128-bit vector of [2 x i64] containing the result.

Definition at line 3275 of file avx2intrin.h.

Referenced by _mm_mask_broadcastq_epi64(), and _mm_maskz_broadcastq_epi64().

◆ _mm_broadcastsd_pd()

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_broadcastsd_pd ( __m128d  __a)
static

Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __a to both elements of the result's 128-bit vector of [2 x double].

This intrinsic corresponds to the MOVDDUP instruction.

Parameters
__aA 128-bit vector of [2 x double] whose low element will be broadcast.
Returns
A 128-bit vector of [2 x double] containing the result.

Definition at line 3021 of file avx2intrin.h.

References __a.

◆ _mm_broadcastss_ps()

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_broadcastss_ps ( __m128  __X)
static

Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 128-bit vector of [4 x float].

This intrinsic corresponds to the VBROADCASTSS instruction.

Parameters
__XA 128-bit vector of [4 x float] whose low element will be broadcast.
Returns
A 128-bit vector of [4 x float] containing the result.

Definition at line 3004 of file avx2intrin.h.

Referenced by _mm_mask_broadcastss_ps(), and _mm_maskz_broadcastss_ps().

◆ _mm_broadcastw_epi16()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastw_epi16 ( __m128i  __X)
static

Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 128-bit vector of [8 x i16].

This intrinsic corresponds to the VPBROADCASTW instruction.

Parameters
__XA 128-bit vector of [8 x i16] whose low element will be broadcast.
Returns
A 128-bit vector of [8 x i16] containing the result.

Definition at line 3243 of file avx2intrin.h.

Referenced by _mm_mask_broadcastw_epi16(), and _mm_maskz_broadcastw_epi16().

◆ _mm_maskload_epi32()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32 ( int const *  __X,
__m128i  __M 
)
static

Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.

Returns the 128-bit [4 x i32] result.

FOR i := 0 TO 3
j := i*32
IF __M[j+31] == 1
result[j+31:j] := Load32(__X+(i*4))
ELSE
result[j+31:j] := 0
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVD instruction.

Parameters
__XA pointer to the memory used for loading values.
__MA 128-bit vector of [4 x i32] containing the mask bits.
Returns
A 128-bit vector of [4 x i32] containing the loaded or zeroed elements.

Definition at line 3577 of file avx2intrin.h.

◆ _mm_maskload_epi64()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64 ( long long const *  __X,
__m128i  __M 
)
static

Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.

Returns the 128-bit [2 x i64] result.

FOR i := 0 TO 1
j := i*64
IF __M[j+63] == 1
result[j+63:j] := Load64(__X+(i*8))
ELSE
result[j+63:j] := 0
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVQ instruction.

Parameters
__XA pointer to the memory used for loading values.
__MA 128-bit vector of [2 x i64] containing the mask bits.
Returns
A 128-bit vector of [2 x i64] containing the loaded or zeroed elements.

Definition at line 3609 of file avx2intrin.h.

◆ _mm_maskstore_epi32()

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32 ( int *  __X,
__m128i  __M,
__m128i  __Y 
)
static

Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.

FOR i := 0 TO 3
j := i*32
IF __M[j+31] == 1
Store32(__X+(i*4), __Y[j+31:j])
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVD instruction.

Parameters
__XA pointer to the memory used for storing values.
__MA 128-bit vector of [4 x i32] containing the mask bits.
__YA 128-bit vector of [4 x i32] containing the values to store.

Definition at line 3699 of file avx2intrin.h.

References __Y.

◆ _mm_maskstore_epi64()

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64 ( long long *  __X,
__m128i  __M,
__m128i  __Y 
)
static

Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.

FOR i := 0 TO 1
j := i*64
IF __M[j+63] == 1
Store64(__X+(i*8), __Y[j+63:j])
FI
ENDFOR

This intrinsic corresponds to the VPMASKMOVQ instruction.

Parameters
__X: A pointer to the memory used for storing values.
__M: A 128-bit vector of [2 x i64] containing the mask bits.
__Y: A 128-bit vector of [2 x i64] containing the values to store.

Definition at line 3729 of file avx2intrin.h.

References __Y.

◆ _mm_sllv_epi32()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi32 ( __m128i  __X,
__m128i  __Y 
)
static

Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 31, the result for that element is zero.

This intrinsic corresponds to the VPSLLVD instruction.

Parameters
__X: A 128-bit vector of [4 x i32] to be shifted.
__Y: A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits).
Returns
A 128-bit vector of [4 x i32] containing the result.

Definition at line 3773 of file avx2intrin.h.

References __Y.

Referenced by _mm_mask_sllv_epi32(), and _mm_maskz_sllv_epi32().

◆ _mm_sllv_epi64()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi64 ( __m128i  __X,
__m128i  __Y 
)
static

Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 63, the result for that element is zero.

This intrinsic corresponds to the VPSLLVQ instruction.

Parameters
__X: A 128-bit vector of [2 x i64] to be shifted.
__Y: A 128-bit vector of [2 x i64] containing the unsigned shift counts (in bits).
Returns
A 128-bit vector of [2 x i64] containing the result.

Definition at line 3817 of file avx2intrin.h.

References __Y.

Referenced by _mm_mask_sllv_epi64(), and _mm_maskz_sllv_epi64().

◆ _mm_srav_epi32()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi32 ( __m128i  __X,
__m128i  __Y 
)
static

Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in sign bits, and returns the result.

If the shift count for any element is greater than 31, the result for that element is 0 or -1 according to the sign bit for that element.

This intrinsic corresponds to the VPSRAVD instruction.

Parameters
__X: A 128-bit vector of [4 x i32] to be shifted.
__Y: A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits).
Returns
A 128-bit vector of [4 x i32] containing the result.

Definition at line 3863 of file avx2intrin.h.

References __Y.

Referenced by _mm_mask_srav_epi32(), and _mm_maskz_srav_epi32().

◆ _mm_srlv_epi32()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi32 ( __m128i  __X,
__m128i  __Y 
)
static

Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 31, the result for that element is zero.

This intrinsic corresponds to the VPSRLVD instruction.

Parameters
__X: A 128-bit vector of [4 x i32] to be shifted.
__Y: A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits).
Returns
A 128-bit vector of [4 x i32] containing the result.

Definition at line 3907 of file avx2intrin.h.

References __Y.

Referenced by _mm_mask_srlv_epi32(), and _mm_maskz_srlv_epi32().

◆ _mm_srlv_epi64()

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi64 ( __m128i  __X,
__m128i  __Y 
)
static

Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.

If the shift count for any element is greater than 63, the result for that element is zero.

This intrinsic corresponds to the VPSRLVQ instruction.

Parameters
__X: A 128-bit vector of [2 x i64] to be shifted.
__Y: A 128-bit vector of [2 x i64] containing the unsigned shift counts (in bits).
Returns
A 128-bit vector of [2 x i64] containing the result.

Definition at line 3951 of file avx2intrin.h.

References __Y.

Referenced by _mm_mask_srlv_epi64(), and _mm_maskz_srlv_epi64().