clang 23.0.0git
riscv_packed_simd.h
Go to the documentation of this file.
1/*===---- riscv_packed_simd.h - RISC-V P intrinsics ------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __RISCV_PACKED_SIMD_H
11#define __RISCV_PACKED_SIMD_H
12
13#include <stdint.h>
14
15#if defined(__cplusplus)
16extern "C" {
17#endif
18
/* Packed SIMD Types */

/* 32-bit packed vectors (__vector_size__ is given in bytes): four 8-bit
 * lanes or two 16-bit lanes. */
typedef int8_t int8x4_t __attribute__((__vector_size__(4)));
typedef uint8_t uint8x4_t __attribute__((__vector_size__(4)));
typedef int16_t int16x2_t __attribute__((__vector_size__(4)));
typedef uint16_t uint16x2_t __attribute__((__vector_size__(4)));

/* 64-bit packed vectors: eight 8-bit, four 16-bit or two 32-bit lanes. */
typedef int8_t int8x8_t __attribute__((__vector_size__(8)));
typedef uint8_t uint8x8_t __attribute__((__vector_size__(8)));
typedef int16_t int16x4_t __attribute__((__vector_size__(8)));
typedef uint16_t uint16x4_t __attribute__((__vector_size__(8)));
typedef int32_t int32x2_t __attribute__((__vector_size__(8)));
typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
32
/* Attributes placed on every intrinsic wrapper generated by the helper macros
 * in this header. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

/* Build a value of vector type `ty` whose 2, 4 or 8 lanes all hold `x`.
 * `x` is expanded once per lane, so pass an expression without side
 * effects. */
#define __packed_splat2(ty, x) ((ty){(x), (x)})
#define __packed_splat4(ty, x) ((ty){(x), (x), (x), (x)})
#define __packed_splat8(ty, x) ((ty){(x), (x), (x), (x), (x), (x), (x), (x)})
38
/* Define `ty __riscv_<name>(scalar_ty __x)`, which returns `splat(ty, __x)`.
 * `splat` is expected to be one of the __packed_splatN helpers, chosen to
 * match the lane count of `ty`. */
#define __packed_splat(name, ty, scalar_ty, splat)                             \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(scalar_ty __x) {      \
    return splat(ty, __x);                                                     \
  }
43
/* Define `ty __riscv_<name>(ty __rs1, unsigned __rs2)`, which applies the
 * shift operator `op` to `__rs1` using `__rs2 & mask` as the shift amount. */
#define __packed_shift(name, ty, op, mask)                                     \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,             \
                                                         unsigned __rs2) {     \
    return __rs1 op(__rs2 & (mask));                                           \
  }
/* Fixed-mask forms: the shift amount is masked with 0x7, 0xf or 0x1f, i.e.
 * reduced modulo 8, 16 or 32. */
#define __packed_shift8(name, ty, op) __packed_shift(name, ty, op, 0x7)
#define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
#define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
52
/* Define `ty __riscv_<name>(ty __rs1, scalar_ty __rs2)`: the scalar `__rs2`
 * is turned into a `ty` value with `splat(ty, __rs2)` and combined with
 * `__rs1` using the binary operator `op`. */
#define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat)              \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,             \
                                                         scalar_ty __rs2) {    \
    return __rs1 op splat(ty, __rs2);                                          \
  }
58
/* Define `ty __riscv_<name>(ty __rs1, ty __rs2)`, which returns
 * `__rs1 op __rs2` for the binary operator token `op`. */
#define __packed_binary_op(name, ty, op)                                       \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
    return __rs1 op __rs2;                                                     \
  }
63
/* Define `ty __riscv_<name>(ty __rs1)`, which returns `op __rs1` for the
 * prefix operator token `op`. */
#define __packed_unary_op(name, ty, op)                                        \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1) {           \
    return op __rs1;                                                           \
  }
68
/* Define `ty __riscv_<name>(ty __rs1, ty __rs2)` as a direct call to
 * `builtin(__rs1, __rs2)`. */
#define __packed_binary_builtin(name, ty, builtin)                             \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
    return builtin(__rs1, __rs2);                                              \
  }
73
/* Define `ty __riscv_<name>(ty __rs1, ty __rs2)`, which returns
 * `(__rs1 << 1) + __rs2` using the plain (non-saturating) vector operators. */
#define __packed_sh1add(name, ty)                                              \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
    return (__rs1 << 1) + __rs2;                                               \
  }
78
/* Define `ty __riscv_<name>(ty __rs1, ty __rs2)`, the saturating counterpart
 * of __packed_sh1add: `__rs1` is doubled with a saturating add of itself and
 * `__rs2` is then added with a second saturating add.
 *
 * TODO: switch to sadd_sat(__builtin_elementwise_shl_sat(a, 1), b) once a
 * generic elementwise shl_sat builtin exists. sadd_sat(a, a) is equivalent
 * for signed types and the backend's saturating_shl1 PatFrags matches both
 * shapes. */
#define __packed_sh1sadd(name, ty)                                             \
  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
    return __builtin_elementwise_add_sat(                                      \
        __builtin_elementwise_add_sat(__rs1, __rs1), __rs2);                   \
  }
88
89// clang-format off: macro call sites have no trailing semicolons, which
90// confuses clang-format into a deeply nested expression.
91
92/* Packed Splat (32-bit) */
97
98/* Packed Splat (64-bit) */
105
106/* Packed Addition and Subtraction (32-bit) */
112__packed_binary_op(psub_u8x4, uint8x4_t, -)
114__packed_binary_op(psub_u16x2, uint16x2_t, -)
116__packed_unary_op(pneg_i16x2, int16x2_t, -)
117
118/* Packed Addition and Subtraction (64-bit) */
126__packed_binary_op(psub_u8x8, uint8x8_t, -)
128__packed_binary_op(psub_u16x4, uint16x4_t, -)
130__packed_binary_op(psub_u32x2, uint32x2_t, -)
132__packed_unary_op(pneg_i16x4, int16x4_t, -)
134
135/* Packed Addition with Scalar (32-bit) */
142
143/* Packed Addition with Scalar (64-bit) */
154
155/* Packed Saturating Addition and Subtraction (32-bit) */
156__packed_binary_builtin(psadd_i8x4, int8x4_t, __builtin_elementwise_add_sat)
157__packed_binary_builtin(psadd_i16x2, int16x2_t, __builtin_elementwise_add_sat)
158__packed_binary_builtin(psaddu_u8x4, uint8x4_t, __builtin_elementwise_add_sat)
159__packed_binary_builtin(psaddu_u16x2, uint16x2_t, __builtin_elementwise_add_sat)
160__packed_binary_builtin(pssub_i8x4, int8x4_t, __builtin_elementwise_sub_sat)
161__packed_binary_builtin(pssub_i16x2, int16x2_t, __builtin_elementwise_sub_sat)
162__packed_binary_builtin(pssubu_u8x4, uint8x4_t, __builtin_elementwise_sub_sat)
163__packed_binary_builtin(pssubu_u16x2, uint16x2_t, __builtin_elementwise_sub_sat)
164
165/* Packed Saturating Addition and Subtraction (64-bit) */
166__packed_binary_builtin(psadd_i8x8, int8x8_t, __builtin_elementwise_add_sat)
167__packed_binary_builtin(psadd_i16x4, int16x4_t, __builtin_elementwise_add_sat)
168__packed_binary_builtin(psadd_i32x2, int32x2_t, __builtin_elementwise_add_sat)
169__packed_binary_builtin(psaddu_u8x8, uint8x8_t, __builtin_elementwise_add_sat)
170__packed_binary_builtin(psaddu_u16x4, uint16x4_t, __builtin_elementwise_add_sat)
171__packed_binary_builtin(psaddu_u32x2, uint32x2_t, __builtin_elementwise_add_sat)
172__packed_binary_builtin(pssub_i8x8, int8x8_t, __builtin_elementwise_sub_sat)
173__packed_binary_builtin(pssub_i16x4, int16x4_t, __builtin_elementwise_sub_sat)
174__packed_binary_builtin(pssub_i32x2, int32x2_t, __builtin_elementwise_sub_sat)
175__packed_binary_builtin(pssubu_u8x8, uint8x8_t, __builtin_elementwise_sub_sat)
176__packed_binary_builtin(pssubu_u16x4, uint16x4_t, __builtin_elementwise_sub_sat)
177__packed_binary_builtin(pssubu_u32x2, uint32x2_t, __builtin_elementwise_sub_sat)
178
179/* Packed Shift-Add (32-bit) */
180__packed_sh1add(psh1add_i16x2, int16x2_t)
182__packed_sh1sadd(pssh1sadd_i16x2, int16x2_t)
183
184/* Packed Shift-Add (64-bit) */
186__packed_sh1add(psh1add_u16x4, uint16x4_t)
188__packed_sh1add(psh1add_u32x2, uint32x2_t)
190__packed_sh1sadd(pssh1sadd_i32x2, int32x2_t)
191
192/* Packed Minimum and Maximum (32-bit) */
193__packed_binary_builtin(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
194__packed_binary_builtin(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
195__packed_binary_builtin(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
196__packed_binary_builtin(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
197__packed_binary_builtin(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
198__packed_binary_builtin(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
199__packed_binary_builtin(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
200__packed_binary_builtin(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
201
202/* Packed Minimum and Maximum (64-bit) */
203__packed_binary_builtin(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
204__packed_binary_builtin(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
205__packed_binary_builtin(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
206__packed_binary_builtin(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
207__packed_binary_builtin(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
208__packed_binary_builtin(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
209__packed_binary_builtin(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
210__packed_binary_builtin(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
211__packed_binary_builtin(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
212__packed_binary_builtin(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
213__packed_binary_builtin(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
214__packed_binary_builtin(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
215
216/* Packed Shifts (32-bit) */
217__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
218__packed_shift8(psll_s_i8x4, int8x4_t, <<)
219__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
220__packed_shift16(psll_s_i16x2, int16x2_t, <<)
221__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
222__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
223__packed_shift8(psra_s_i8x4, int8x4_t, >>)
224__packed_shift16(psra_s_i16x2, int16x2_t, >>)
225
226/* Packed Shifts (64-bit) */
227__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
228__packed_shift8(psll_s_i8x8, int8x8_t, <<)
229__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
230__packed_shift16(psll_s_i16x4, int16x4_t, <<)
231__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
232__packed_shift32(psll_s_i32x2, int32x2_t, <<)
233__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
234__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
235__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
236__packed_shift8(psra_s_i8x8, int8x8_t, >>)
237__packed_shift16(psra_s_i16x4, int16x4_t, >>)
238__packed_shift32(psra_s_i32x2, int32x2_t, >>)
239
240/* Packed Logical Operations (32-bit) */
242__packed_binary_op(pand_u8x4, uint8x4_t, &)
244__packed_binary_op(pand_u16x2, uint16x2_t, &)
246__packed_binary_op(por_u8x4, uint8x4_t, |)
248__packed_binary_op(por_u16x2, uint16x2_t, |)
250__packed_binary_op(pxor_u8x4, uint8x4_t, ^)
252__packed_binary_op(pxor_u16x2, uint16x2_t, ^)
254__packed_unary_op(pnot_u8x4, uint8x4_t, ~)
256__packed_unary_op(pnot_u16x2, uint16x2_t, ~)
257
258/* Packed Logical Operations (64-bit) */
260__packed_binary_op(pand_u8x8, uint8x8_t, &)
262__packed_binary_op(pand_u16x4, uint16x4_t, &)
264__packed_binary_op(pand_u32x2, uint32x2_t, &)
266__packed_binary_op(por_u8x8, uint8x8_t, |)
268__packed_binary_op(por_u16x4, uint16x4_t, |)
270__packed_binary_op(por_u32x2, uint32x2_t, |)
272__packed_binary_op(pxor_u8x8, uint8x8_t, ^)
274__packed_binary_op(pxor_u16x4, uint16x4_t, ^)
276__packed_binary_op(pxor_u32x2, uint32x2_t, ^)
278__packed_unary_op(pnot_u8x8, uint8x8_t, ~)
280__packed_unary_op(pnot_u16x4, uint16x4_t, ~)
282__packed_unary_op(pnot_u32x2, uint32x2_t, ~)
283
284// clang-format on
285
/* The helper macros above exist only to generate the intrinsics; remove them
 * so they do not leak into code that includes this header. */
#undef __packed_splat2
#undef __packed_splat4
#undef __packed_splat8
#undef __packed_splat
#undef __packed_shift
#undef __packed_shift8
#undef __packed_shift16
#undef __packed_shift32
#undef __packed_scalar_binary_op
#undef __packed_binary_op
#undef __packed_unary_op
#undef __packed_binary_builtin
#undef __packed_sh1add
#undef __packed_sh1sadd
#undef __DEFAULT_FN_ATTRS
301
302#if defined(__cplusplus)
303}
304#endif
305
306#endif /* __RISCV_PACKED_SIMD_H */
#define __packed_binary_op(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8x8_t
#define __packed_splat2(ty, x)
int8_t int8x4_t __attribute__((__vector_size__(4)))
#define __packed_shift8(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 int16x4_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint16x4_t
#define __packed_splat8(ty, x)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 int32_t
__packed_splat4 int16_t
#define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat)
#define __packed_shift16(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 __packed_splat4 uint16_t
#define __packed_binary_builtin(name, ty, builtin)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 __packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 uint32_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint16x2_t
#define __packed_unary_op(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8x4_t
__packed_splat4 int16x2_t
#define __packed_sh1add(name, ty)
#define __packed_sh1sadd(name, ty)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint32x2_t
#define __packed_shift32(name, ty, op)
#define __packed_splat(name, ty, scalar_ty, splat)
__packed_splat4 __packed_splat2 int8x8_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 int32x2_t
#define __packed_splat4(ty, x)