clang 22.0.0git
fmaintrin.h
Go to the documentation of this file.
1/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __FMAINTRIN_H
15#define __FMAINTRIN_H
16
17/* Define the default attributes for the functions in this file. */
/* Both sets carry __always_inline__, __nodebug__ and __target__("fma"); they
 * differ only in the __min_vector_width__ value (128 vs. 256). */
18#define __DEFAULT_FN_ATTRS128 \
19 __attribute__((__always_inline__, __nodebug__, __target__("fma"), \
20 __min_vector_width__(128)))
21#define __DEFAULT_FN_ATTRS256 \
22 __attribute__((__always_inline__, __nodebug__, __target__("fma"), \
23 __min_vector_width__(256)))
24
/* The *_CONSTEXPR variants append `constexpr` when compiled as C++11 or
 * later; otherwise they expand to exactly the plain attribute sets. */
25#if defined(__cplusplus) && (__cplusplus >= 201103L)
26#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
27#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
28#else
29#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
30#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
31#endif
32
33/// Computes a multiply-add of 128-bit vectors of [4 x float].
34/// For each element, computes <c> (__A * __B) + __C </c>.
35///
36/// \headerfile <immintrin.h>
37///
38/// This intrinsic corresponds to the \c VFMADD213PS instruction.
39///
40/// \param __A
41/// A 128-bit vector of [4 x float] containing the multiplicand.
42/// \param __B
43/// A 128-bit vector of [4 x float] containing the multiplier.
44/// \param __C
45/// A 128-bit vector of [4 x float] containing the addend.
46/// \returns A 128-bit vector of [4 x float] containing the result.
47static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
48_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
49{
  /* All three operands are forwarded unchanged to the element-wise FMA
   * builtin; the casts only change the vector type it sees. */
50 return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
51 (__v4sf)__C);
52}
53
54/// Computes a multiply-add of 128-bit vectors of [2 x double].
55/// For each element, computes <c> (__A * __B) + __C </c>.
56///
57/// \headerfile <immintrin.h>
58///
59/// This intrinsic corresponds to the \c VFMADD213PD instruction.
60///
61/// \param __A
62/// A 128-bit vector of [2 x double] containing the multiplicand.
63/// \param __B
64/// A 128-bit vector of [2 x double] containing the multiplier.
65/// \param __C
66/// A 128-bit vector of [2 x double] containing the addend.
67/// \returns A 128-bit vector of [2 x double] containing the result.
68static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
69_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
70{
  /* All three operands are forwarded unchanged to the element-wise FMA
   * builtin; the casts only change the vector type it sees. */
71 return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
72 (__v2df)__C);
73}
74
75/// Computes a scalar multiply-add of the single-precision values in the
76/// low 32 bits of 128-bit vectors of [4 x float].
77///
78/// \code{.operation}
79/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
80/// result[127:32] = __A[127:32]
81/// \endcode
82///
83/// \headerfile <immintrin.h>
84///
85/// This intrinsic corresponds to the \c VFMADD213SS instruction.
86///
87/// \param __A
88/// A 128-bit vector of [4 x float] containing the multiplicand in the low
89/// 32 bits.
90/// \param __B
91/// A 128-bit vector of [4 x float] containing the multiplier in the low
92/// 32 bits.
93/// \param __C
94/// A 128-bit vector of [4 x float] containing the addend in the low
95/// 32 bits.
96/// \returns A 128-bit vector of [4 x float] containing the result in the low
97/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
98static __inline__ __m128 __DEFAULT_FN_ATTRS128
99_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
100{
  /* Scalar form: the operands go unmodified to the target-specific scalar
   * FMA builtin instead of the element-wise one used by the packed forms. */
101 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
102}
103
104/// Computes a scalar multiply-add of the double-precision values in the
105/// low 64 bits of 128-bit vectors of [2 x double].
106///
107/// \code{.operation}
108/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
109/// result[127:64] = __A[127:64]
110/// \endcode
111///
112/// \headerfile <immintrin.h>
113///
114/// This intrinsic corresponds to the \c VFMADD213SD instruction.
115///
116/// \param __A
117/// A 128-bit vector of [2 x double] containing the multiplicand in the low
118/// 64 bits.
119/// \param __B
120/// A 128-bit vector of [2 x double] containing the multiplier in the low
121/// 64 bits.
122/// \param __C
123/// A 128-bit vector of [2 x double] containing the addend in the low
124/// 64 bits.
125/// \returns A 128-bit vector of [2 x double] containing the result in the low
126/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
127static __inline__ __m128d __DEFAULT_FN_ATTRS128
128_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
129{
  /* Scalar form: the operands go unmodified to the target-specific scalar
   * FMA builtin instead of the element-wise one used by the packed forms. */
130 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
131}
132
133/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
134/// For each element, computes <c> (__A * __B) - __C </c>.
135///
136/// \headerfile <immintrin.h>
137///
138/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
139///
140/// \param __A
141/// A 128-bit vector of [4 x float] containing the multiplicand.
142/// \param __B
143/// A 128-bit vector of [4 x float] containing the multiplier.
144/// \param __C
145/// A 128-bit vector of [4 x float] containing the subtrahend.
146/// \returns A 128-bit vector of [4 x float] containing the result.
147static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
148_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
149{
  /* Multiply-subtract is expressed as a multiply-add with __C negated:
   * (__A * __B) + (-__C). */
150 return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
151 -(__v4sf)__C);
152}
153
154/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
155/// For each element, computes <c> (__A * __B) - __C </c>.
156///
157/// \headerfile <immintrin.h>
158///
159/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
160///
161/// \param __A
162/// A 128-bit vector of [2 x double] containing the multiplicand.
163/// \param __B
164/// A 128-bit vector of [2 x double] containing the multiplier.
165/// \param __C
166/// A 128-bit vector of [2 x double] containing the subtrahend.
167/// \returns A 128-bit vector of [2 x double] containing the result.
168static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
169_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
170{
  /* Multiply-subtract is expressed as a multiply-add with __C negated:
   * (__A * __B) + (-__C). */
171 return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
172 -(__v2df)__C);
173}
174
175/// Computes a scalar multiply-subtract of the single-precision values in
176/// the low 32 bits of 128-bit vectors of [4 x float].
177///
178/// \code{.operation}
179/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
180/// result[127:32] = __A[127:32]
181/// \endcode
182///
183/// \headerfile <immintrin.h>
184///
185/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
186///
187/// \param __A
188/// A 128-bit vector of [4 x float] containing the multiplicand in the low
189/// 32 bits.
190/// \param __B
191/// A 128-bit vector of [4 x float] containing the multiplier in the low
192/// 32 bits.
193/// \param __C
194/// A 128-bit vector of [4 x float] containing the subtrahend in the low
195/// 32 bits.
196/// \returns A 128-bit vector of [4 x float] containing the result in the low
197/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
198static __inline__ __m128 __DEFAULT_FN_ATTRS128
199_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
200{
  /* Reuses the scalar multiply-add builtin; negating __C turns the add into
   * a subtract. */
201 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
202}
203
204/// Computes a scalar multiply-subtract of the double-precision values in
205/// the low 64 bits of 128-bit vectors of [2 x double].
206///
207/// \code{.operation}
208/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
209/// result[127:64] = __A[127:64]
210/// \endcode
211///
212/// \headerfile <immintrin.h>
213///
214/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
215///
216/// \param __A
217/// A 128-bit vector of [2 x double] containing the multiplicand in the low
218/// 64 bits.
219/// \param __B
220/// A 128-bit vector of [2 x double] containing the multiplier in the low
221/// 64 bits.
222/// \param __C
223/// A 128-bit vector of [2 x double] containing the subtrahend in the low
224/// 64 bits.
225/// \returns A 128-bit vector of [2 x double] containing the result in the low
226/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
227static __inline__ __m128d __DEFAULT_FN_ATTRS128
228_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
229{
  /* Reuses the scalar multiply-add builtin; negating __C turns the add into
   * a subtract. */
230 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
231}
232
233/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
234/// For each element, computes <c> -(__A * __B) + __C </c>.
235///
236/// \headerfile <immintrin.h>
237///
238/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
239///
240/// \param __A
241/// A 128-bit vector of [4 x float] containing the multiplicand.
242/// \param __B
243/// A 128-bit vector of [4 x float] containing the multiplier.
244/// \param __C
245/// A 128-bit vector of [4 x float] containing the addend.
246/// \returns A 128-bit vector of [4 x float] containing the result.
247static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
248_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
249{
  /* Negating __A negates the product: (-__A * __B) + __C. */
250 return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
251 (__v4sf)__C);
252}
253
254/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
255/// For each element, computes <c> -(__A * __B) + __C </c>.
256///
257/// \headerfile <immintrin.h>
258///
259/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
260///
261/// \param __A
262/// A 128-bit vector of [2 x double] containing the multiplicand.
263/// \param __B
264/// A 128-bit vector of [2 x double] containing the multiplier.
265/// \param __C
266/// A 128-bit vector of [2 x double] containing the addend.
267/// \returns A 128-bit vector of [2 x double] containing the result.
268static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
269_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
270{
  /* Negating __A negates the product: (-__A * __B) + __C. */
271 return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
272 (__v2df)__C);
273}
274
275/// Computes a scalar negated multiply-add of the single-precision values in
276/// the low 32 bits of 128-bit vectors of [4 x float].
277///
278/// \code{.operation}
279/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
280/// result[127:32] = __A[127:32]
281/// \endcode
282///
283/// \headerfile <immintrin.h>
284///
285/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
286///
287/// \param __A
288/// A 128-bit vector of [4 x float] containing the multiplicand in the low
289/// 32 bits.
290/// \param __B
291/// A 128-bit vector of [4 x float] containing the multiplier in the low
292/// 32 bits.
293/// \param __C
294/// A 128-bit vector of [4 x float] containing the addend in the low
295/// 32 bits.
296/// \returns A 128-bit vector of [4 x float] containing the result in the low
297/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
298static __inline__ __m128 __DEFAULT_FN_ATTRS128
299_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
300{
  /* Unlike the packed form, __B (not __A) is negated here. NOTE(review):
   * presumably so the upper elements of __A, which the documented result
   * copies, reach the builtin unmodified -- confirm. */
301 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
302}
303
304/// Computes a scalar negated multiply-add of the double-precision values
305/// in the low 64 bits of 128-bit vectors of [2 x double].
306///
307/// \code{.operation}
308/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
309/// result[127:64] = __A[127:64]
310/// \endcode
311///
312/// \headerfile <immintrin.h>
313///
314/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
315///
316/// \param __A
317/// A 128-bit vector of [2 x double] containing the multiplicand in the low
318/// 64 bits.
319/// \param __B
320/// A 128-bit vector of [2 x double] containing the multiplier in the low
321/// 64 bits.
322/// \param __C
323/// A 128-bit vector of [2 x double] containing the addend in the low
324/// 64 bits.
325/// \returns A 128-bit vector of [2 x double] containing the result in the low
326/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
327static __inline__ __m128d __DEFAULT_FN_ATTRS128
328_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
329{
  /* Unlike the packed form, __B (not __A) is negated here. NOTE(review):
   * presumably so the upper element of __A, which the documented result
   * copies, reaches the builtin unmodified -- confirm. */
330 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
331}
332
333/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
334/// For each element, computes <c> -(__A * __B) - __C </c>.
335///
336/// \headerfile <immintrin.h>
337///
338/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
339///
340/// \param __A
341/// A 128-bit vector of [4 x float] containing the multiplicand.
342/// \param __B
343/// A 128-bit vector of [4 x float] containing the multiplier.
344/// \param __C
345/// A 128-bit vector of [4 x float] containing the subtrahend.
346/// \returns A 128-bit vector of [4 x float] containing the result.
347static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
348_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
349{
  /* Both __A and __C are negated: (-__A * __B) + (-__C). */
350 return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
351 -(__v4sf)__C);
352}
353
354/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
355/// For each element, computes <c> -(__A * __B) - __C </c>.
356///
357/// \headerfile <immintrin.h>
358///
359/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
360///
361/// \param __A
362/// A 128-bit vector of [2 x double] containing the multiplicand.
363/// \param __B
364/// A 128-bit vector of [2 x double] containing the multiplier.
365/// \param __C
366/// A 128-bit vector of [2 x double] containing the subtrahend.
367/// \returns A 128-bit vector of [2 x double] containing the result.
368static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
369_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
370{
  /* Both __A and __C are negated: (-__A * __B) + (-__C). */
371 return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
372 -(__v2df)__C);
373}
374
375/// Computes a scalar negated multiply-subtract of the single-precision
376/// values in the low 32 bits of 128-bit vectors of [4 x float].
377///
378/// \code{.operation}
379/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
380/// result[127:32] = __A[127:32]
381/// \endcode
382///
383/// \headerfile <immintrin.h>
384///
385/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
386///
387/// \param __A
388/// A 128-bit vector of [4 x float] containing the multiplicand in the low
389/// 32 bits.
390/// \param __B
391/// A 128-bit vector of [4 x float] containing the multiplier in the low
392/// 32 bits.
393/// \param __C
394/// A 128-bit vector of [4 x float] containing the subtrahend in the low
395/// 32 bits.
396/// \returns A 128-bit vector of [4 x float] containing the result in the low
397/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
398static __inline__ __m128 __DEFAULT_FN_ATTRS128
399_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
400{
  /* __B and __C are negated while __A is passed through as-is.
   * NOTE(review): presumably __A stays untouched because the documented
   * result copies its upper elements -- confirm. */
401 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
402}
403
404/// Computes a scalar negated multiply-subtract of the double-precision
405/// values in the low 64 bits of 128-bit vectors of [2 x double].
406///
407/// \code{.operation}
408/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
409/// result[127:64] = __A[127:64]
410/// \endcode
411///
412/// \headerfile <immintrin.h>
413///
414/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
415///
416/// \param __A
417/// A 128-bit vector of [2 x double] containing the multiplicand in the low
418/// 64 bits.
419/// \param __B
420/// A 128-bit vector of [2 x double] containing the multiplier in the low
421/// 64 bits.
422/// \param __C
423/// A 128-bit vector of [2 x double] containing the subtrahend in the low
424/// 64 bits.
425/// \returns A 128-bit vector of [2 x double] containing the result in the low
426/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
427static __inline__ __m128d __DEFAULT_FN_ATTRS128
428_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
429{
  /* __B and __C are negated while __A is passed through as-is.
   * NOTE(review): presumably __A stays untouched because the documented
   * result copies its upper element -- confirm. */
430 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
431}
432
433/// Computes a multiply with alternating add/subtract of 128-bit vectors of
434/// [4 x float].
435///
436/// \code{.operation}
437/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
438/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
439/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
440/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
441/// \endcode
442///
443/// \headerfile <immintrin.h>
444///
445/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
446///
447/// \param __A
448/// A 128-bit vector of [4 x float] containing the multiplicand.
449/// \param __B
450/// A 128-bit vector of [4 x float] containing the multiplier.
451/// \param __C
452/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
453/// \returns A 128-bit vector of [4 x float] containing the result.
454static __inline__ __m128 __DEFAULT_FN_ATTRS128
455_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
456{
  /* Operands are forwarded unchanged to the target-specific fmaddsub
   * builtin. */
457 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
458}
459
460/// Computes a multiply with alternating add/subtract of 128-bit vectors of
461/// [2 x double].
462///
463/// \code{.operation}
464/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
465/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
466/// \endcode
467///
468/// \headerfile <immintrin.h>
469///
470/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
471///
472/// \param __A
473/// A 128-bit vector of [2 x double] containing the multiplicand.
474/// \param __B
475/// A 128-bit vector of [2 x double] containing the multiplier.
476/// \param __C
477/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
478/// \returns A 128-bit vector of [2 x double] containing the result.
479static __inline__ __m128d __DEFAULT_FN_ATTRS128
480_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
481{
  /* Operands are forwarded unchanged to the target-specific fmaddsub
   * builtin. */
482 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
483}
484
485/// Computes a multiply with alternating add/subtract of 128-bit vectors of
486/// [4 x float].
487///
488/// \code{.operation}
489/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
490/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
491/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
492/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
493/// \endcode
494///
495/// \headerfile <immintrin.h>
496///
497/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
498///
499/// \param __A
500/// A 128-bit vector of [4 x float] containing the multiplicand.
501/// \param __B
502/// A 128-bit vector of [4 x float] containing the multiplier.
503/// \param __C
504/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
505/// \returns A 128-bit vector of [4 x float] containing the result.
506static __inline__ __m128 __DEFAULT_FN_ATTRS128
507_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
508{
  /* Reuses the fmaddsub builtin with __C negated, which turns each lane's
   * subtract into an add and each add into a subtract. */
509 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
510}
511
512/// Computes a multiply with alternating add/subtract of 128-bit vectors of
513/// [2 x double].
514///
515/// \code{.operation}
516/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
517/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
518/// \endcode
519///
520/// \headerfile <immintrin.h>
521///
522/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
523///
524/// \param __A
525/// A 128-bit vector of [2 x double] containing the multiplicand.
526/// \param __B
527/// A 128-bit vector of [2 x double] containing the multiplier.
528/// \param __C
529/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
530/// \returns A 128-bit vector of [2 x double] containing the result.
531static __inline__ __m128d __DEFAULT_FN_ATTRS128
532_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
533{
  /* Reuses the fmaddsub builtin with __C negated, which turns each lane's
   * subtract into an add and each add into a subtract. */
534 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
535}
536
537/// Computes a multiply-add of 256-bit vectors of [8 x float].
538/// For each element, computes <c> (__A * __B) + __C </c>.
539///
540/// \headerfile <immintrin.h>
541///
542/// This intrinsic corresponds to the \c VFMADD213PS instruction.
543///
544/// \param __A
545/// A 256-bit vector of [8 x float] containing the multiplicand.
546/// \param __B
547/// A 256-bit vector of [8 x float] containing the multiplier.
548/// \param __C
549/// A 256-bit vector of [8 x float] containing the addend.
550/// \returns A 256-bit vector of [8 x float] containing the result.
551static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
552_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
553{
  /* All three operands are forwarded unchanged to the element-wise FMA
   * builtin; the casts only change the vector type it sees. */
554 return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
555 (__v8sf)__C);
556}
557
558/// Computes a multiply-add of 256-bit vectors of [4 x double].
559/// For each element, computes <c> (__A * __B) + __C </c>.
560///
561/// \headerfile <immintrin.h>
562///
563/// This intrinsic corresponds to the \c VFMADD213PD instruction.
564///
565/// \param __A
566/// A 256-bit vector of [4 x double] containing the multiplicand.
567/// \param __B
568/// A 256-bit vector of [4 x double] containing the multiplier.
569/// \param __C
570/// A 256-bit vector of [4 x double] containing the addend.
571/// \returns A 256-bit vector of [4 x double] containing the result.
572static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
573_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
574{
  /* All three operands are forwarded unchanged to the element-wise FMA
   * builtin; the casts only change the vector type it sees. */
575 return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
576 (__v4df)__C);
577}
578
579/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
580/// For each element, computes <c> (__A * __B) - __C </c>.
581///
582/// \headerfile <immintrin.h>
583///
584/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
585///
586/// \param __A
587/// A 256-bit vector of [8 x float] containing the multiplicand.
588/// \param __B
589/// A 256-bit vector of [8 x float] containing the multiplier.
590/// \param __C
591/// A 256-bit vector of [8 x float] containing the subtrahend.
592/// \returns A 256-bit vector of [8 x float] containing the result.
593static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
594_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
595{
  /* Multiply-subtract is expressed as a multiply-add with __C negated:
   * (__A * __B) + (-__C). */
596 return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
597 -(__v8sf)__C);
598}
599
600/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
601/// For each element, computes <c> (__A * __B) - __C </c>.
602///
603/// \headerfile <immintrin.h>
604///
605/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
606///
607/// \param __A
608/// A 256-bit vector of [4 x double] containing the multiplicand.
609/// \param __B
610/// A 256-bit vector of [4 x double] containing the multiplier.
611/// \param __C
612/// A 256-bit vector of [4 x double] containing the subtrahend.
613/// \returns A 256-bit vector of [4 x double] containing the result.
614static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
615_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
616{
  /* Multiply-subtract is expressed as a multiply-add with __C negated:
   * (__A * __B) + (-__C). */
617 return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
618 -(__v4df)__C);
619}
620
621/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
622/// For each element, computes <c> -(__A * __B) + __C </c>.
623///
624/// \headerfile <immintrin.h>
625///
626/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
627///
628/// \param __A
629/// A 256-bit vector of [8 x float] containing the multiplicand.
630/// \param __B
631/// A 256-bit vector of [8 x float] containing the multiplier.
632/// \param __C
633/// A 256-bit vector of [8 x float] containing the addend.
634/// \returns A 256-bit vector of [8 x float] containing the result.
635static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
636_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
637{
  /* Negating __A negates the product: (-__A * __B) + __C. */
638 return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
639 (__v8sf)__C);
640}
641
642/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
643/// For each element, computes <c> -(__A * __B) + __C </c>.
644///
645/// \headerfile <immintrin.h>
646///
647/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
648///
649/// \param __A
650/// A 256-bit vector of [4 x double] containing the multiplicand.
651/// \param __B
652/// A 256-bit vector of [4 x double] containing the multiplier.
653/// \param __C
654/// A 256-bit vector of [4 x double] containing the addend.
655/// \returns A 256-bit vector of [4 x double] containing the result.
656static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
657_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
658{
  /* Negating __A negates the product: (-__A * __B) + __C. */
659 return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
660 (__v4df)__C);
661}
662
663/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
664/// For each element, computes <c> -(__A * __B) - __C </c>.
665///
666/// \headerfile <immintrin.h>
667///
668/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
669///
670/// \param __A
671/// A 256-bit vector of [8 x float] containing the multiplicand.
672/// \param __B
673/// A 256-bit vector of [8 x float] containing the multiplier.
674/// \param __C
675/// A 256-bit vector of [8 x float] containing the subtrahend.
676/// \returns A 256-bit vector of [8 x float] containing the result.
677static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
678_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
679{
  /* Both __A and __C are negated: (-__A * __B) + (-__C). */
680 return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
681 -(__v8sf)__C);
682}
683
684/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
685/// For each element, computes <c> -(__A * __B) - __C </c>.
686///
687/// \headerfile <immintrin.h>
688///
689/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
690///
691/// \param __A
692/// A 256-bit vector of [4 x double] containing the multiplicand.
693/// \param __B
694/// A 256-bit vector of [4 x double] containing the multiplier.
695/// \param __C
696/// A 256-bit vector of [4 x double] containing the subtrahend.
697/// \returns A 256-bit vector of [4 x double] containing the result.
698static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
699_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
700{
  /* Both __A and __C are negated: (-__A * __B) + (-__C). */
701 return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
702 -(__v4df)__C);
703}
704
705/// Computes a multiply with alternating add/subtract of 256-bit vectors of
706/// [8 x float].
707///
708/// \code{.operation}
709/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
710/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
711/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
712/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
713/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
714/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
715/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
716/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
717/// \endcode
718///
719/// \headerfile <immintrin.h>
720///
721/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
722///
723/// \param __A
724/// A 256-bit vector of [8 x float] containing the multiplicand.
725/// \param __B
726/// A 256-bit vector of [8 x float] containing the multiplier.
727/// \param __C
728/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
729/// \returns A 256-bit vector of [8 x float] containing the result.
730static __inline__ __m256 __DEFAULT_FN_ATTRS256
731_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
732{
  /* Operands are forwarded unchanged to the 256-bit target-specific
   * fmaddsub builtin. */
733 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
734}
735
736/// Computes a multiply with alternating add/subtract of 256-bit vectors of
737/// [4 x double].
738///
739/// \code{.operation}
740/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
741/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
742/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
743/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
744/// \endcode
745///
746/// \headerfile <immintrin.h>
747///
748/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
749///
750/// \param __A
751/// A 256-bit vector of [4 x double] containing the multiplicand.
752/// \param __B
753/// A 256-bit vector of [4 x double] containing the multiplier.
754/// \param __C
755/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
756/// \returns A 256-bit vector of [4 x double] containing the result.
757static __inline__ __m256d __DEFAULT_FN_ATTRS256
758_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
759{
  /* Operands are forwarded unchanged to the 256-bit target-specific
   * fmaddsub builtin. */
760 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
761}
762
763/// Computes a vector multiply with alternating add/subtract of 256-bit
764/// vectors of [8 x float].
765///
766/// \code{.operation}
767/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
768/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
769/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
770/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
771/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
772/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
773/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
774/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
775/// \endcode
776///
777/// \headerfile <immintrin.h>
778///
779/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
780///
781/// \param __A
782/// A 256-bit vector of [8 x float] containing the multiplicand.
783/// \param __B
784/// A 256-bit vector of [8 x float] containing the multiplier.
785/// \param __C
786/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
787/// \returns A 256-bit vector of [8 x float] containing the result.
788static __inline__ __m256 __DEFAULT_FN_ATTRS256
789_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
790{
  /* Reuses the fmaddsub builtin with __C negated, which turns each lane's
   * subtract into an add and each add into a subtract. */
791 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
792}
793
794/// Computes a vector multiply with alternating add/subtract of 256-bit
795/// vectors of [4 x double].
796///
797/// \code{.operation}
798/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
799/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
800/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
801/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
802/// \endcode
803///
804/// \headerfile <immintrin.h>
805///
806/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
807///
808/// \param __A
809/// A 256-bit vector of [4 x double] containing the multiplicand.
810/// \param __B
811/// A 256-bit vector of [4 x double] containing the multiplier.
812/// \param __C
813/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
814/// \returns A 256-bit vector of [4 x double] containing the result.
815static __inline__ __m256d __DEFAULT_FN_ATTRS256
816_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
817{
  /* Reuses the fmaddsub builtin with __C negated, which turns each lane's
   * subtract into an add and each add into a subtract. */
818 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
819}
820
/* Remove the four attribute helper macros defined at the top of this header
 * so they do not remain defined after it has been included. */
821#undef __DEFAULT_FN_ATTRS128
822#undef __DEFAULT_FN_ATTRS256
823#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
824#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
825
826#endif /* __FMAINTRIN_H */
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a negated multiply-add of 128-bit vectors of [4 x float].
Definition fmaintrin.h:248
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a vector multiply with alternating add/subtract of 256-bit vectors of [8 x float].
Definition fmaintrin.h:789
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply-add of 128-bit vectors of [4 x float].
Definition fmaintrin.h:48
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a negated multiply-add of 128-bit vectors of [2 x double].
Definition fmaintrin.h:269
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply with alternating add/subtract of 256-bit vectors of [8 x float].
Definition fmaintrin.h:731
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [4 x float].
Definition fmaintrin.h:507
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [4 x float].
Definition fmaintrin.h:455
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply with alternating add/subtract of 256-bit vectors of [4 x double].
Definition fmaintrin.h:758
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply-subtract of 256-bit vectors of [4 x double].
Definition fmaintrin.h:615
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply-subtract of 256-bit vectors of [8 x float].
Definition fmaintrin.h:594
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [2 x double].
Definition fmaintrin.h:532
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar multiply-subtract of the single-precision values in the low 32 bits of 128-bit vect...
Definition fmaintrin.h:199
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar multiply-subtract of the double-precision values in the low 64 bits of 128-bit vect...
Definition fmaintrin.h:228
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar negated multiply-subtract of the single-precision values in the low 32 bits of 128-...
Definition fmaintrin.h:399
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar multiply-add of the double-precision values in the low 64 bits of 128-bit vectors o...
Definition fmaintrin.h:128
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply with alternating add/subtract of 128-bit vectors of [2 x double].
Definition fmaintrin.h:480
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply-subtract of 128-bit vectors of [4 x float].
Definition fmaintrin.h:148
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a negated multiply-add of 256-bit vectors of [4 x double].
Definition fmaintrin.h:657
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply-add of 256-bit vectors of [8 x float].
Definition fmaintrin.h:552
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
Definition fmaintrin.h:699
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
Definition fmaintrin.h:678
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a vector multiply with alternating add/subtract of 256-bit vectors of [4 x double].
Definition fmaintrin.h:816
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar negated multiply-add of the double-precision values in the low 64 bits of 128-bit v...
Definition fmaintrin.h:328
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply-add of 128-bit vectors of [2 x double].
Definition fmaintrin.h:69
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply-add of 256-bit vectors of [4 x double].
Definition fmaintrin.h:573
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar negated multiply-subtract of the double-precision values in the low 64 bits of 128-...
Definition fmaintrin.h:428
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
Definition fmaintrin.h:348
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a negated multiply-add of 256-bit vectors of [8 x float].
Definition fmaintrin.h:636
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply-subtract of 128-bit vectors of [2 x double].
Definition fmaintrin.h:169
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
Definition fmaintrin.h:369
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar negated multiply-add of the single-precision values in the low 32 bits of 128-bit v...
Definition fmaintrin.h:299
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar multiply-add of the single-precision values in the low 32 bits of 128-bit vectors o...
Definition fmaintrin.h:99