clang 23.0.0git
riscv_packed_simd.h
Go to the documentation of this file.
1/*===---- riscv_packed_simd.h - RISC-V P intrinsics ------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __RISCV_PACKED_SIMD_H
11#define __RISCV_PACKED_SIMD_H
12
13#include <stdint.h>
14
15#if defined(__cplusplus)
16extern "C" {
17#endif
18
19/* Packed SIMD Types */
20
/* 4-byte packed vectors: int8/uint8 and int16/uint16 element types. */
typedef __attribute__((__vector_size__(4))) int8_t int8x4_t;
typedef __attribute__((__vector_size__(4))) uint8_t uint8x4_t;
typedef __attribute__((__vector_size__(4))) int16_t int16x2_t;
typedef __attribute__((__vector_size__(4))) uint16_t uint16x2_t;
25
/* 8-byte packed vectors: 8-, 16- and 32-bit signed/unsigned element types. */
typedef __attribute__((__vector_size__(8))) int8_t int8x8_t;
typedef __attribute__((__vector_size__(8))) uint8_t uint8x8_t;
typedef __attribute__((__vector_size__(8))) int16_t int16x4_t;
typedef __attribute__((__vector_size__(8))) uint16_t uint16x4_t;
typedef __attribute__((__vector_size__(8))) int32_t int32x2_t;
typedef __attribute__((__vector_size__(8))) uint32_t uint32x2_t;
32
/* Attributes applied to every intrinsic wrapper generated in this header. */
#define __DEFAULT_FN_ATTRS __attribute__((__nodebug__, __always_inline__))
34
/* Compound-literal helpers: build a vec_ty value whose 2, 4 or 8 elements are
 * all `val`. Note that `val` is expanded once per element. */
#define __packed_splat2(vec_ty, val) ((vec_ty){(val), (val)})
#define __packed_splat4(vec_ty, val) ((vec_ty){(val), (val), (val), (val)})
#define __packed_splat8(vec_ty, val) \
  ((vec_ty){(val), (val), (val), (val), (val), (val), (val), (val)})
38
/* Generates __riscv_<fn>(elem_ty): returns a vec_ty produced by handing the
 * scalar argument to the given splat helper macro. */
#define __packed_splat(fn, vec_ty, elem_ty, splat_fn) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(elem_ty __x) { \
    return splat_fn(vec_ty, __x); \
  }
43
/* Generates __riscv_<fn>(vec_ty, unsigned): applies the shift operator `oper`
 * to the vector with a scalar shift amount. The amount is first ANDed with
 * `amt_mask`. */
#define __packed_shift(fn, vec_ty, oper, amt_mask) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, unsigned __rs2) { \
    return __rs1 oper (__rs2 & (amt_mask)); \
  }
/* Shift generators per element width; the shift-amount mask is width - 1. */
#define __packed_shift8(fn, vec_ty, oper) \
  __packed_shift(fn, vec_ty, oper, (8 - 1))
#define __packed_shift16(fn, vec_ty, oper) \
  __packed_shift(fn, vec_ty, oper, (16 - 1))
#define __packed_shift32(fn, vec_ty, oper) \
  __packed_shift(fn, vec_ty, oper, (32 - 1))
52
/* Generates __riscv_<fn>(vec_ty, elem_ty): splats the scalar operand with the
 * given splat helper and combines it with the vector using `oper`. */
#define __packed_scalar_binary_op(fn, vec_ty, elem_ty, oper, splat_fn) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, elem_ty __rs2) { \
    return __rs1 oper splat_fn(vec_ty, __rs2); \
  }
58
/* Generates __riscv_<fn>(vec_ty, vec_ty): returns `__rs1 oper __rs2` using the
 * compiler's vector operator. */
#define __packed_binary_op(fn, vec_ty, oper) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, vec_ty __rs2) { \
    return __rs1 oper __rs2; \
  }
63
/* Generates __riscv_<fn>(vec_ty): returns the prefix operator `oper` applied
 * to the vector operand. */
#define __packed_unary_op(fn, vec_ty, oper) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1) { \
    return oper __rs1; \
  }
68
/* Generates __riscv_<fn>(vec_ty, vec_ty) as a direct forward to the two-operand
 * builtin `impl`; operands and result share the same vector type. */
#define __packed_binary_builtin(fn, vec_ty, impl) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, vec_ty __rs2) { \
    return impl(__rs1, __rs2); \
  }
73
/* Generates __riscv_<fn>(vec_ty, vec_ty): returns (__rs1 << 1) + __rs2. */
#define __packed_sh1add(fn, vec_ty) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, vec_ty __rs2) { \
    return (__rs1 << 1) + __rs2; \
  }
78
/* Generates __riscv_<fn>(vec_ty, vec_ty): saturating-adds __rs1 to itself and
 * then saturating-adds __rs2 to that result.
 *
 * TODO: once a generic elementwise shl_sat builtin exists, switch to
 * sadd_sat(__builtin_elementwise_shl_sat(a, 1), b). For signed types
 * sadd_sat(a, a) is equivalent, and the backend's saturating_shl1 PatFrags
 * matches both shapes. */
#define __packed_sh1sadd(fn, vec_ty) \
  static __inline__ vec_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, vec_ty __rs2) { \
    return __builtin_elementwise_add_sat( \
        __builtin_elementwise_add_sat(__rs1, __rs1), __rs2); \
  }
88
/* Generates __riscv_<fn>(vec_ty, vec_ty): evaluates `__rs1 oper __rs2` with the
 * compiler's vector comparison and casts the result to ret_ty. */
#define __packed_cmp(fn, vec_ty, ret_ty, oper) \
  static __inline__ ret_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, vec_ty __rs2) { \
    return (ret_ty)(__rs1 oper __rs2); \
  }
94
/* Generates __riscv_<fn>(vec_ty): returns __builtin_elementwise_abs of the
 * operand, cast to ret_ty. */
#define __packed_pabs(fn, vec_ty, ret_ty) \
  static __inline__ ret_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1) { \
    return (ret_ty)__builtin_elementwise_abs(__rs1); \
  }
99
/* Generates __riscv_<fn>(vec_ty, vec_ty): forwards both operands to the
 * builtin `impl` and casts its result to ret_ty. */
#define __packed_binary_builtin_cast(fn, vec_ty, ret_ty, impl) \
  static __inline__ ret_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, vec_ty __rs2) { \
    return (ret_ty)impl(__rs1, __rs2); \
  }
105
/* Generates __riscv_<fn>(vec_ty, ret_ty): forwards a vector operand and a
 * scalar operand of the result type to the builtin `impl` and returns its
 * scalar result. */
#define __packed_reduction(fn, ret_ty, vec_ty, impl) \
  static __inline__ ret_ty __DEFAULT_FN_ATTRS \
  __riscv_##fn(vec_ty __rs1, ret_ty __rs2) { \
    return impl(__rs1, __rs2); \
  }
111
112// clang-format off: macro call sites have no trailing semicolons, which
113// confuses clang-format into a deeply nested expression.
114
115/* Packed Splat (32-bit) */
120
121/* Packed Splat (64-bit) */
128
129/* Packed Addition and Subtraction (32-bit) */
135__packed_binary_op(psub_u8x4, uint8x4_t, -)
137__packed_binary_op(psub_u16x2, uint16x2_t, -)
139__packed_unary_op(pneg_i16x2, int16x2_t, -)
140
141/* Packed Addition and Subtraction (64-bit) */
149__packed_binary_op(psub_u8x8, uint8x8_t, -)
151__packed_binary_op(psub_u16x4, uint16x4_t, -)
153__packed_binary_op(psub_u32x2, uint32x2_t, -)
155__packed_unary_op(pneg_i16x4, int16x4_t, -)
157
158/* Packed Addition with Scalar (32-bit) */
165
166/* Packed Addition with Scalar (64-bit) */
177
178/* Packed Saturating Addition and Subtraction (32-bit) */
179__packed_binary_builtin(psadd_i8x4, int8x4_t, __builtin_elementwise_add_sat)
180__packed_binary_builtin(psadd_i16x2, int16x2_t, __builtin_elementwise_add_sat)
181__packed_binary_builtin(psaddu_u8x4, uint8x4_t, __builtin_elementwise_add_sat)
182__packed_binary_builtin(psaddu_u16x2, uint16x2_t, __builtin_elementwise_add_sat)
183__packed_binary_builtin(pssub_i8x4, int8x4_t, __builtin_elementwise_sub_sat)
184__packed_binary_builtin(pssub_i16x2, int16x2_t, __builtin_elementwise_sub_sat)
185__packed_binary_builtin(pssubu_u8x4, uint8x4_t, __builtin_elementwise_sub_sat)
186__packed_binary_builtin(pssubu_u16x2, uint16x2_t, __builtin_elementwise_sub_sat)
187
188/* Packed Saturating Addition and Subtraction (64-bit) */
189__packed_binary_builtin(psadd_i8x8, int8x8_t, __builtin_elementwise_add_sat)
190__packed_binary_builtin(psadd_i16x4, int16x4_t, __builtin_elementwise_add_sat)
191__packed_binary_builtin(psadd_i32x2, int32x2_t, __builtin_elementwise_add_sat)
192__packed_binary_builtin(psaddu_u8x8, uint8x8_t, __builtin_elementwise_add_sat)
193__packed_binary_builtin(psaddu_u16x4, uint16x4_t, __builtin_elementwise_add_sat)
194__packed_binary_builtin(psaddu_u32x2, uint32x2_t, __builtin_elementwise_add_sat)
195__packed_binary_builtin(pssub_i8x8, int8x8_t, __builtin_elementwise_sub_sat)
196__packed_binary_builtin(pssub_i16x4, int16x4_t, __builtin_elementwise_sub_sat)
197__packed_binary_builtin(pssub_i32x2, int32x2_t, __builtin_elementwise_sub_sat)
198__packed_binary_builtin(pssubu_u8x8, uint8x8_t, __builtin_elementwise_sub_sat)
199__packed_binary_builtin(pssubu_u16x4, uint16x4_t, __builtin_elementwise_sub_sat)
200__packed_binary_builtin(pssubu_u32x2, uint32x2_t, __builtin_elementwise_sub_sat)
201
202/* Packed Shift-Add (32-bit) */
203__packed_sh1add(psh1add_i16x2, int16x2_t)
205__packed_sh1sadd(pssh1sadd_i16x2, int16x2_t)
206
207/* Packed Shift-Add (64-bit) */
209__packed_sh1add(psh1add_u16x4, uint16x4_t)
211__packed_sh1add(psh1add_u32x2, uint32x2_t)
213__packed_sh1sadd(pssh1sadd_i32x2, int32x2_t)
214
215/* Packed Exchanged Addition and Subtraction (32-bit) */
216__packed_binary_builtin(pas_x_i16x2, int16x2_t, __builtin_riscv_pas_x_i16x2)
217__packed_binary_builtin(psa_x_i16x2, int16x2_t, __builtin_riscv_psa_x_i16x2)
218__packed_binary_builtin(psas_x_i16x2, int16x2_t, __builtin_riscv_psas_x_i16x2)
219__packed_binary_builtin(pssa_x_i16x2, int16x2_t, __builtin_riscv_pssa_x_i16x2)
220__packed_binary_builtin(paas_x_i16x2, int16x2_t, __builtin_riscv_paas_x_i16x2)
221__packed_binary_builtin(pasa_x_i16x2, int16x2_t, __builtin_riscv_pasa_x_i16x2)
222
223/* Packed Exchanged Addition and Subtraction (64-bit) */
224__packed_binary_builtin(pas_x_i16x4, int16x4_t, __builtin_riscv_pas_x_i16x4)
225__packed_binary_builtin(psa_x_i16x4, int16x4_t, __builtin_riscv_psa_x_i16x4)
226__packed_binary_builtin(psas_x_i16x4, int16x4_t, __builtin_riscv_psas_x_i16x4)
227__packed_binary_builtin(pssa_x_i16x4, int16x4_t, __builtin_riscv_pssa_x_i16x4)
228__packed_binary_builtin(paas_x_i16x4, int16x4_t, __builtin_riscv_paas_x_i16x4)
229__packed_binary_builtin(pasa_x_i16x4, int16x4_t, __builtin_riscv_pasa_x_i16x4)
230__packed_binary_builtin(pas_x_i32x2, int32x2_t, __builtin_riscv_pas_x_i32x2)
231__packed_binary_builtin(psa_x_i32x2, int32x2_t, __builtin_riscv_psa_x_i32x2)
232__packed_binary_builtin(psas_x_i32x2, int32x2_t, __builtin_riscv_psas_x_i32x2)
233__packed_binary_builtin(pssa_x_i32x2, int32x2_t, __builtin_riscv_pssa_x_i32x2)
234__packed_binary_builtin(paas_x_i32x2, int32x2_t, __builtin_riscv_paas_x_i32x2)
235__packed_binary_builtin(pasa_x_i32x2, int32x2_t, __builtin_riscv_pasa_x_i32x2)
236
237/* Packed Minimum and Maximum (32-bit) */
238__packed_binary_builtin(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
239__packed_binary_builtin(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
240__packed_binary_builtin(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
241__packed_binary_builtin(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
242__packed_binary_builtin(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
243__packed_binary_builtin(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
244__packed_binary_builtin(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
245__packed_binary_builtin(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
246
247/* Packed Minimum and Maximum (64-bit) */
248__packed_binary_builtin(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
249__packed_binary_builtin(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
250__packed_binary_builtin(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
251__packed_binary_builtin(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
252__packed_binary_builtin(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
253__packed_binary_builtin(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
254__packed_binary_builtin(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
255__packed_binary_builtin(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
256__packed_binary_builtin(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
257__packed_binary_builtin(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
258__packed_binary_builtin(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
259__packed_binary_builtin(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
260
261/* Packed Comparison (32-bit) */
262__packed_cmp(pmseq_i8x4_u8x4, int8x4_t, uint8x4_t, ==)
263__packed_cmp(pmseq_u8x4_u8x4, uint8x4_t, uint8x4_t, ==)
264__packed_cmp(pmsne_i8x4_u8x4, int8x4_t, uint8x4_t, !=)
265__packed_cmp(pmsne_u8x4_u8x4, uint8x4_t, uint8x4_t, !=)
266__packed_cmp(pmslt_u8x4, int8x4_t, uint8x4_t, <)
267__packed_cmp(pmsltu_u8x4, uint8x4_t, uint8x4_t, <)
268__packed_cmp(pmsgt_u8x4, int8x4_t, uint8x4_t, >)
269__packed_cmp(pmsgtu_u8x4, uint8x4_t, uint8x4_t, >)
270__packed_cmp(pmsge_u8x4, int8x4_t, uint8x4_t, >=)
271__packed_cmp(pmsgeu_u8x4, uint8x4_t, uint8x4_t, >=)
272__packed_cmp(pmsle_u8x4, int8x4_t, uint8x4_t, <=)
273__packed_cmp(pmsleu_u8x4, uint8x4_t, uint8x4_t, <=)
274__packed_cmp(pmseq_i16x2_u16x2, int16x2_t, uint16x2_t, ==)
275__packed_cmp(pmseq_u16x2_u16x2, uint16x2_t, uint16x2_t, ==)
276__packed_cmp(pmsne_i16x2_u16x2, int16x2_t, uint16x2_t, !=)
277__packed_cmp(pmsne_u16x2_u16x2, uint16x2_t, uint16x2_t, !=)
278__packed_cmp(pmslt_u16x2, int16x2_t, uint16x2_t, <)
279__packed_cmp(pmsltu_u16x2, uint16x2_t, uint16x2_t, <)
280__packed_cmp(pmsgt_u16x2, int16x2_t, uint16x2_t, >)
281__packed_cmp(pmsgtu_u16x2, uint16x2_t, uint16x2_t, >)
282__packed_cmp(pmsge_u16x2, int16x2_t, uint16x2_t, >=)
283__packed_cmp(pmsgeu_u16x2, uint16x2_t, uint16x2_t, >=)
284__packed_cmp(pmsle_u16x2, int16x2_t, uint16x2_t, <=)
285__packed_cmp(pmsleu_u16x2, uint16x2_t, uint16x2_t, <=)
286
287/* Packed Comparison (64-bit) */
288__packed_cmp(pmseq_i8x8_u8x8, int8x8_t, uint8x8_t, ==)
289__packed_cmp(pmseq_u8x8_u8x8, uint8x8_t, uint8x8_t, ==)
290__packed_cmp(pmsne_i8x8_u8x8, int8x8_t, uint8x8_t, !=)
291__packed_cmp(pmsne_u8x8_u8x8, uint8x8_t, uint8x8_t, !=)
292__packed_cmp(pmslt_u8x8, int8x8_t, uint8x8_t, <)
293__packed_cmp(pmsltu_u8x8, uint8x8_t, uint8x8_t, <)
294__packed_cmp(pmsgt_u8x8, int8x8_t, uint8x8_t, >)
295__packed_cmp(pmsgtu_u8x8, uint8x8_t, uint8x8_t, >)
296__packed_cmp(pmsge_u8x8, int8x8_t, uint8x8_t, >=)
297__packed_cmp(pmsgeu_u8x8, uint8x8_t, uint8x8_t, >=)
298__packed_cmp(pmsle_u8x8, int8x8_t, uint8x8_t, <=)
299__packed_cmp(pmsleu_u8x8, uint8x8_t, uint8x8_t, <=)
300__packed_cmp(pmseq_i16x4_u16x4, int16x4_t, uint16x4_t, ==)
301__packed_cmp(pmseq_u16x4_u16x4, uint16x4_t, uint16x4_t, ==)
302__packed_cmp(pmsne_i16x4_u16x4, int16x4_t, uint16x4_t, !=)
303__packed_cmp(pmsne_u16x4_u16x4, uint16x4_t, uint16x4_t, !=)
304__packed_cmp(pmslt_u16x4, int16x4_t, uint16x4_t, <)
305__packed_cmp(pmsltu_u16x4, uint16x4_t, uint16x4_t, <)
306__packed_cmp(pmsgt_u16x4, int16x4_t, uint16x4_t, >)
307__packed_cmp(pmsgtu_u16x4, uint16x4_t, uint16x4_t, >)
308__packed_cmp(pmsge_u16x4, int16x4_t, uint16x4_t, >=)
309__packed_cmp(pmsgeu_u16x4, uint16x4_t, uint16x4_t, >=)
310__packed_cmp(pmsle_u16x4, int16x4_t, uint16x4_t, <=)
311__packed_cmp(pmsleu_u16x4, uint16x4_t, uint16x4_t, <=)
312__packed_cmp(pmseq_i32x2_u32x2, int32x2_t, uint32x2_t, ==)
313__packed_cmp(pmseq_u32x2_u32x2, uint32x2_t, uint32x2_t, ==)
314__packed_cmp(pmsne_i32x2_u32x2, int32x2_t, uint32x2_t, !=)
315__packed_cmp(pmsne_u32x2_u32x2, uint32x2_t, uint32x2_t, !=)
316__packed_cmp(pmslt_u32x2, int32x2_t, uint32x2_t, <)
317__packed_cmp(pmsltu_u32x2, uint32x2_t, uint32x2_t, <)
318__packed_cmp(pmsgt_u32x2, int32x2_t, uint32x2_t, >)
319__packed_cmp(pmsgtu_u32x2, uint32x2_t, uint32x2_t, >)
320__packed_cmp(pmsge_u32x2, int32x2_t, uint32x2_t, >=)
321__packed_cmp(pmsgeu_u32x2, uint32x2_t, uint32x2_t, >=)
322__packed_cmp(pmsle_u32x2, int32x2_t, uint32x2_t, <=)
323__packed_cmp(pmsleu_u32x2, uint32x2_t, uint32x2_t, <=)
324
325/* Packed Shifts (32-bit) */
326__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
327__packed_shift8(psll_s_i8x4, int8x4_t, <<)
328__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
329__packed_shift16(psll_s_i16x2, int16x2_t, <<)
330__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
331__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
332__packed_shift8(psra_s_i8x4, int8x4_t, >>)
333__packed_shift16(psra_s_i16x2, int16x2_t, >>)
334
335/* Packed Shifts (64-bit) */
336__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
337__packed_shift8(psll_s_i8x8, int8x8_t, <<)
338__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
339__packed_shift16(psll_s_i16x4, int16x4_t, <<)
340__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
341__packed_shift32(psll_s_i32x2, int32x2_t, <<)
342__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
343__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
344__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
345__packed_shift8(psra_s_i8x8, int8x8_t, >>)
346__packed_shift16(psra_s_i16x4, int16x4_t, >>)
347__packed_shift32(psra_s_i32x2, int32x2_t, >>)
348
349/* Packed Logical Operations (32-bit) */
350__packed_binary_op(pand_i8x4, int8x4_t, &)
351__packed_binary_op(pand_u8x4, uint8x4_t, &)
352__packed_binary_op(pand_i16x2, int16x2_t, &)
353__packed_binary_op(pand_u16x2, uint16x2_t, &)
354__packed_binary_op(por_i8x4, int8x4_t, |)
355__packed_binary_op(por_u8x4, uint8x4_t, |)
356__packed_binary_op(por_i16x2, int16x2_t, |)
357__packed_binary_op(por_u16x2, uint16x2_t, |)
358__packed_binary_op(pxor_i8x4, int8x4_t, ^)
359__packed_binary_op(pxor_u8x4, uint8x4_t, ^)
360__packed_binary_op(pxor_i16x2, int16x2_t, ^)
361__packed_binary_op(pxor_u16x2, uint16x2_t, ^)
362__packed_unary_op(pnot_i8x4, int8x4_t, ~)
363__packed_unary_op(pnot_u8x4, uint8x4_t, ~)
364__packed_unary_op(pnot_i16x2, int16x2_t, ~)
365__packed_unary_op(pnot_u16x2, uint16x2_t, ~)
366
367/* Packed Logical Operations (64-bit) */
368__packed_binary_op(pand_i8x8, int8x8_t, &)
369__packed_binary_op(pand_u8x8, uint8x8_t, &)
370__packed_binary_op(pand_i16x4, int16x4_t, &)
371__packed_binary_op(pand_u16x4, uint16x4_t, &)
372__packed_binary_op(pand_i32x2, int32x2_t, &)
373__packed_binary_op(pand_u32x2, uint32x2_t, &)
374__packed_binary_op(por_i8x8, int8x8_t, |)
375__packed_binary_op(por_u8x8, uint8x8_t, |)
376__packed_binary_op(por_i16x4, int16x4_t, |)
377__packed_binary_op(por_u16x4, uint16x4_t, |)
378__packed_binary_op(por_i32x2, int32x2_t, |)
379__packed_binary_op(por_u32x2, uint32x2_t, |)
380__packed_binary_op(pxor_i8x8, int8x8_t, ^)
381__packed_binary_op(pxor_u8x8, uint8x8_t, ^)
382__packed_binary_op(pxor_i16x4, int16x4_t, ^)
383__packed_binary_op(pxor_u16x4, uint16x4_t, ^)
384__packed_binary_op(pxor_i32x2, int32x2_t, ^)
385__packed_binary_op(pxor_u32x2, uint32x2_t, ^)
386__packed_unary_op(pnot_i8x8, int8x8_t, ~)
387__packed_unary_op(pnot_u8x8, uint8x8_t, ~)
388__packed_unary_op(pnot_i16x4, int16x4_t, ~)
389__packed_unary_op(pnot_u16x4, uint16x4_t, ~)
390__packed_unary_op(pnot_i32x2, int32x2_t, ~)
391__packed_unary_op(pnot_u32x2, uint32x2_t, ~)
392
393/* Packed Averaging Addition and Subtraction (32-bit) */
394__packed_binary_builtin(paadd_i8x4, int8x4_t, __builtin_riscv_paadd_i8x4)
395__packed_binary_builtin(paadd_i16x2, int16x2_t, __builtin_riscv_paadd_i16x2)
396__packed_binary_builtin(paaddu_u8x4, uint8x4_t, __builtin_riscv_paaddu_u8x4)
397__packed_binary_builtin(paaddu_u16x2, uint16x2_t, __builtin_riscv_paaddu_u16x2)
398__packed_binary_builtin(pasub_i8x4, int8x4_t, __builtin_riscv_pasub_i8x4)
399__packed_binary_builtin(pasub_i16x2, int16x2_t, __builtin_riscv_pasub_i16x2)
400__packed_binary_builtin(pasubu_u8x4, uint8x4_t, __builtin_riscv_pasubu_u8x4)
401__packed_binary_builtin(pasubu_u16x2, uint16x2_t, __builtin_riscv_pasubu_u16x2)
402
403/* Packed Averaging Addition and Subtraction (64-bit) */
404__packed_binary_builtin(paadd_i8x8, int8x8_t, __builtin_riscv_paadd_i8x8)
405__packed_binary_builtin(paadd_i16x4, int16x4_t, __builtin_riscv_paadd_i16x4)
406__packed_binary_builtin(paadd_i32x2, int32x2_t, __builtin_riscv_paadd_i32x2)
407__packed_binary_builtin(paaddu_u8x8, uint8x8_t, __builtin_riscv_paaddu_u8x8)
408__packed_binary_builtin(paaddu_u16x4, uint16x4_t, __builtin_riscv_paaddu_u16x4)
409__packed_binary_builtin(paaddu_u32x2, uint32x2_t, __builtin_riscv_paaddu_u32x2)
410__packed_binary_builtin(pasub_i8x8, int8x8_t, __builtin_riscv_pasub_i8x8)
411__packed_binary_builtin(pasub_i16x4, int16x4_t, __builtin_riscv_pasub_i16x4)
412__packed_binary_builtin(pasub_i32x2, int32x2_t, __builtin_riscv_pasub_i32x2)
413__packed_binary_builtin(pasubu_u8x8, uint8x8_t, __builtin_riscv_pasubu_u8x8)
414__packed_binary_builtin(pasubu_u16x4, uint16x4_t, __builtin_riscv_pasubu_u16x4)
415__packed_binary_builtin(pasubu_u32x2, uint32x2_t, __builtin_riscv_pasubu_u32x2)
416
417/* Packed Absolute Value and Absolute Difference (32-bit) */
420__packed_binary_builtin_cast(pabd_i8x4, int8x4_t, uint8x4_t, __builtin_riscv_pabd_i8x4)
421__packed_binary_builtin_cast(pabd_i16x2, int16x2_t, uint16x2_t, __builtin_riscv_pabd_i16x2)
422__packed_binary_builtin_cast(pabdu_u8x4, uint8x4_t, uint8x4_t, __builtin_riscv_pabdu_u8x4)
423__packed_binary_builtin_cast(pabdu_u16x2, uint16x2_t, uint16x2_t, __builtin_riscv_pabdu_u16x2)
424
425/* Packed Absolute Value and Absolute Difference (64-bit) */
428__packed_binary_builtin_cast(pabd_i8x8, int8x8_t, uint8x8_t, __builtin_riscv_pabd_i8x8)
429__packed_binary_builtin_cast(pabd_i16x4, int16x4_t, uint16x4_t, __builtin_riscv_pabd_i16x4)
430__packed_binary_builtin_cast(pabdu_u8x8, uint8x8_t, uint8x8_t, __builtin_riscv_pabdu_u8x8)
431__packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, uint16x4_t, __builtin_riscv_pabdu_u16x4)
432
433/* Packed Reduction Sum (32-bit) */
434__packed_reduction(predsum_i8x4_i32, int32_t, int8x4_t, __builtin_riscv_predsum_i8x4_i32)
435__packed_reduction(predsumu_u8x4_u32, uint32_t, uint8x4_t, __builtin_riscv_predsumu_u8x4_u32)
436__packed_reduction(predsum_i16x2_i32, int32_t, int16x2_t, __builtin_riscv_predsum_i16x2_i32)
437__packed_reduction(predsumu_u16x2_u32, uint32_t, uint16x2_t, __builtin_riscv_predsumu_u16x2_u32)
438
439/* Packed Reduction Sum (64-bit) */
440__packed_reduction(predsum_i8x8_i32, int32_t, int8x8_t, __builtin_riscv_predsum_i8x8_i32)
441__packed_reduction(predsumu_u8x8_u32, uint32_t, uint8x8_t, __builtin_riscv_predsumu_u8x8_u32)
442__packed_reduction(predsum_i16x4_i32, int32_t, int16x4_t, __builtin_riscv_predsum_i16x4_i32)
443__packed_reduction(predsumu_u16x4_u32, uint32_t, uint16x4_t, __builtin_riscv_predsumu_u16x4_u32)
444__packed_reduction(predsum_i8x8_i64, int64_t, int8x8_t, __builtin_riscv_predsum_i8x8_i64)
445__packed_reduction(predsumu_u8x8_u64, uint64_t, uint8x8_t, __builtin_riscv_predsumu_u8x8_u64)
446__packed_reduction(predsum_i16x4_i64, int64_t, int16x4_t, __builtin_riscv_predsum_i16x4_i64)
447__packed_reduction(predsumu_u16x4_u64, uint64_t, uint16x4_t, __builtin_riscv_predsumu_u16x4_u64)
448__packed_reduction(predsum_i32x2_i64, int64_t, int32x2_t, __builtin_riscv_predsum_i32x2_i64)
449__packed_reduction(predsumu_u32x2_u64, uint64_t, uint32x2_t, __builtin_riscv_predsumu_u32x2_u64)
450
451// clang-format on
452
/* The generator macros are private to this header: remove them so they do not
 * leak into code that includes it. */
#undef __packed_splat2
#undef __packed_splat4
#undef __packed_splat8
#undef __packed_splat

#undef __packed_shift
#undef __packed_shift8
#undef __packed_shift16
#undef __packed_shift32

#undef __packed_scalar_binary_op
#undef __packed_binary_op
#undef __packed_unary_op
#undef __packed_binary_builtin
#undef __packed_sh1add
#undef __packed_sh1sadd
#undef __packed_cmp
#undef __packed_pabs
#undef __packed_binary_builtin_cast
#undef __packed_reduction

#undef __DEFAULT_FN_ATTRS
472
473#if defined(__cplusplus)
474}
475#endif
476
477#endif /* __RISCV_PACKED_SIMD_H */
#define __packed_cmp(name, ty, rty, op)
__packed_splat4 int16x2_t
#define __packed_binary_op(name, ty, op)
#define __packed_splat2(ty, x)
int8_t int8x4_t __attribute__((__vector_size__(4)))
#define __packed_reduction(name, rty, ty, builtin)
#define __packed_shift8(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint16x4_t
#define __packed_splat8(ty, x)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 int32_t
__packed_splat4 int16_t
#define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat)
#define __packed_shift16(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 __packed_splat4 uint16_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint32x2_t
#define __packed_binary_builtin(name, ty, builtin)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 __packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 uint32_t
#define __packed_unary_op(name, ty, op)
#define __packed_sh1add(name, ty)
#define __packed_pabs(name, ty, rty)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint16x2_t
#define __packed_sh1sadd(name, ty)
#define __packed_shift32(name, ty, op)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 int32x2_t
#define __packed_splat(name, ty, scalar_ty, splat)
__packed_splat4 __packed_splat2 int8x8_t
#define __packed_binary_builtin_cast(name, ty, rty, builtin)
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8x4_t
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8x8_t
#define __packed_splat4(ty, x)
__packed_splat4 __packed_splat2 __packed_splat8 int16x4_t