/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * The Arm C Language Extensions specification can be found at:
 * https://github.com/ARM-software/acle/releases
 *
 * The ACLE section numbers are subject to change. When consulting the
 * specification, it is recommended to search by section title in case
 * the section numbers have become outdated.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H

#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 7.3 Memory barriers */
void __dmb(unsigned int);
void __dsb(unsigned int);
void __isb(unsigned int);
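
/*
 * Illustrative usage (not part of the upstream header): the argument
 * selects the barrier domain and type defined by ACLE; 0xF is the
 * full-system "SY" option, so a full data memory barrier is written:
 *
 *   __dmb(0xF);  // complete all earlier memory accesses first
 */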

/* 7.4 Hints */
void __wfi(void);
void __wfe(void);
void __sev(void);
void __sevl(void);
void __yield(void);

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define _CHKFEAT_GCS 1
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__chkfeat(uint64_t __features) {
  return __builtin_arm_chkfeat(__features) ^ __features;
}
#endif
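
/*
 * Example (illustrative): __chkfeat returns the queried feature bits
 * that are known to be enabled, so Guarded Control Stack support can
 * be tested at runtime with:
 *
 *   if (__chkfeat(_CHKFEAT_GCS)) {
 *     // GCS is enabled
 *   }
 */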

/* 7.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t __v;
#if (__ARM_FEATURE_LDREX & 4) || __ARM_ARCH_6M__ || __linux__
  /*
   * Using this clang builtin is sensible in most situations. Where
   * LDREX and STREX are available, it will compile to a loop using
   * them. Otherwise it will compile to a libcall, requiring the
   * runtime to provide that library function.
   *
   * That's unavoidable on Armv6-M, which has no atomic instructions
   * at all (not even SWP), so in that situation the user will just
   * have to provide an implementation of __atomic_exchange_4 (perhaps
   * it would temporarily disable interrupts, and then do a separate
   * load and store).
   *
   * We also use the libcall strategy on pre-Armv7 Linux targets, on
   * the theory that Linux's runtime support library _will_ provide a
   * suitable libcall, and it's better to use that than the SWP
   * instruction because then when the same binary is run on a later
   * Linux system the libcall implementation will use LDREX instead.
   */
  __v = __atomic_exchange_n(__p, __x, __ATOMIC_RELAXED);
#else
  /*
   * But for older Arm architectures when the target is not Linux, we
   * fall back to using the SWP instruction via inline assembler. ACLE
   * is clear that we're allowed to do this, but shouldn't do it if we
   * have a better alternative.
   */
  __asm__("swp %0, %1, [%2]" : "=r"(__v) : "r"(__x), "r"(__p) : "memory");
#endif
  return __v;
}
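
/*
 * Usage sketch (illustrative only): __swp atomically exchanges a word,
 * which is enough for a crude spinlock:
 *
 *   while (__swp(1, &lock) != 0)
 *     ;          // spin until the previous value was 0 (unlocked)
 *   ...critical section...
 *   lock = 0;    // release (memory barriers omitted for brevity)
 */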

/* 7.6 Memory prefetch intrinsics */
/* 7.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#define __pldx_range(access_kind, retention_policy, length, count, stride, \
                     reuse_distance, addr) \
  __builtin_arm_range_prefetch_x(addr, access_kind, retention_policy, length, \
                                 count, stride, reuse_distance)
#define __pld_range(access_kind, retention_policy, metadata, addr) \
  __builtin_arm_range_prefetch(addr, access_kind, retention_policy, metadata)
#endif
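
/*
 * Illustrative: the argument order is (access_kind, cache_level,
 * retention_policy, addr), so a prefetch of buf[i] for an upcoming
 * write (access_kind 1) into L1 (cache_level 0) with normal retention
 * (retention_policy 0) would be:
 *
 *   __pldx(1, 0, 0, &buf[i]);
 */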

/* 7.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#define __pldir(addr) __builtin_arm_prefetch_ir(addr)
#endif

/* 7.7 NOP */
#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
}
#endif

/* 8 DATA-PROCESSING INTRINSICS */
/* 8.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
  __y %= 32;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (32 - __y));
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
  __y %= 64;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (64 - __y));
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}
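
/*
 * Example: rotating right by one moves the low bit to the top, so
 *
 *   __ror(0x80000001u, 1) == 0xC0000000u
 */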

/* CLZ */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}

/* CLS */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}
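
/*
 * Examples: __clz counts leading zero bits; __cls counts the leading
 * bits that match the sign bit, excluding the sign bit itself:
 *
 *   __clz(1u)          == 31
 *   __cls(0xFFFF0000u) == 15
 */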

/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}

/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) |
         (uint64_t)__rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}

/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  return (int16_t)__builtin_bswap16((uint16_t)__t);
}
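
/*
 * Examples: __rev reverses all four bytes, while __rev16 reverses the
 * bytes within each halfword:
 *
 *   __rev(0x11223344u)   == 0x44332211u
 *   __rev16(0x11223344u) == 0x22114433u
 */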

/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}
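
/*
 * Example: __rbit mirrors the bit order, so the least significant bit
 * becomes the most significant bit:
 *
 *   __rbit(0x1u) == 0x80000000u
 */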

/* 8.3 16-bit multiplications */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif
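
/*
 * Example (illustrative): __smulbb multiplies the bottom (low)
 * halfwords of both operands as signed 16-bit values:
 *
 *   __smulbb(0x00030002, 0x00050004) == 2 * 4 == 8
 */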

/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change the guard to the corresponding __ARM_FEATURE flag once the
 * Q-flag intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif

/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
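
/*
 * Example: the addition saturates instead of wrapping, setting the
 * Q flag when it does:
 *
 *   __qadd(INT32_MAX, 1) == INT32_MAX
 */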

/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif

/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif

/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif
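
/*
 * Usage sketch (illustrative): __sel picks each byte lane from its
 * first or second operand according to the GE flags left by a
 * preceding parallel add or subtract, e.g. a per-byte unsigned maximum:
 *
 *   __usub8(a, b);       // sets GE[i] where byte i of a >= byte i of b
 *   max = __sel(a, b);   // per byte: a where GE is set, else b
 */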

/* 8.5.7 Parallel 8-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif

/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
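
/*
 * Example: sum of per-byte absolute differences,
 *
 *   __usad8(0x01020304u, 0x04030201u) == 3 + 1 + 1 + 3 == 8
 */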

/* 8.5.9 Parallel 16-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif

/* 8.5.10 Parallel 16-bit multiplication */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
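
/*
 * Example (illustrative): __smuad is a dual 16-bit multiply with add,
 * a.lo * b.lo + a.hi * b.hi, a natural building block for fixed-point
 * dot products:
 *
 *   int16x2_t a = 0x00030002;  // halfwords {2, 3}
 *   int16x2_t b = 0x00050004;  // halfwords {4, 5}
 *   __smuad(a, b) == 2 * 4 + 3 * 5 == 23
 */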

/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) && \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif

/* 8.8 CRC32 intrinsics */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
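
/*
 * Usage sketch (assumes a little-endian buffer whose length is a
 * multiple of 8): fold a buffer into a CRC-32 checksum eight bytes at
 * a time, starting from the conventional all-ones seed:
 *
 *   uint32_t crc = ~0u;
 *   for (size_t i = 0; i < len; i += 8) {
 *     uint64_t chunk;
 *     __builtin_memcpy(&chunk, buf + i, 8);
 *     crc = __crc32d(crc, chunk);
 *   }
 *   crc = ~crc;
 */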

/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif

/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif

/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
  uint64_t val[8];
} data512_t;

static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
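
/*
 * Usage sketch (illustrative; __src and __dev are assumed to be
 * suitably aligned 64-byte regions, e.g. device memory for the store):
 *
 *   data512_t __blk = __arm_ld64b(__src);
 *   __arm_st64b(__dev, __blk);
 */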

/* 11.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
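
/*
 * Example: on AArch64 the virtual counter can be read through the
 * 64-bit form using the usual system-register name:
 *
 *   uint64_t ticks = __arm_rsr64("cntvct_el0");
 */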

/* 10.3 MTE intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 memcpy family of operations intrinsics - MOPS */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
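
/*
 * Usage sketch (illustrative; assumes MTE is enabled so that tagged
 * accesses are checked): tag a fresh allocation before handing it out:
 *
 *   int *__p = ...;                                 // untagged pointer
 *   int *__t = __arm_mte_create_random_tag(__p, 0); // 0: exclude no tags
 *   __arm_mte_set_tag(__t);                  // tag one 16-byte granule
 *   return __t;
 */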

/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)

#if (__ARM_FEATURE_COPROC & 0x1)

#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif /* __ARM_FEATURE_COPROC & 0x2 */

#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif /* __ARM_FEATURE_COPROC & 0x4 */

#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif /* __ARM_FEATURE_COPROC & 0x8 */

#endif /* __ARM_FEATURE_COPROC */

/* 8.7 Armv8.5-A Random number generation intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
#endif
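
/*
 * Example: both intrinsics return 0 on success and store the random
 * value through the pointer:
 *
 *   uint64_t __r;
 *   if (__rndr(&__r) == 0) {
 *     // __r now holds a hardware random number
 *   }
 */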

/* Atomic store with PCDPHINT */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_atomic_store_with_stshh(ptr, data, memory_order, \
                                      retention_policy) \
  __builtin_arm_atomic_store_with_stshh(ptr, data, memory_order, \
                                        retention_policy)
#endif

/* 11.2 Guarded Control Stack intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ void *__attribute__((__always_inline__, __nodebug__))
__gcspr(void) {
  return (void *)__builtin_arm_rsr64("gcspr_el0");
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
__gcspopm(void) {
  return __builtin_arm_gcspopm(0);
}

static __inline__ void *__attribute__((__always_inline__, __nodebug__,
                                       target("gcs")))
__gcsss(void *__stack) {
  return __builtin_arm_gcsss(__stack);
}
#endif

#if defined(__cplusplus)
}
#endif

#endif /* __ARM_ACLE_H */