core/stdarch/crates/core_arch/src/x86/fma.rs

//! Fused Multiply-Add instruction set (FMA)
//!
//! The FMA instruction set is an extension to the 128-bit and 256-bit SSE
//! instructions in the x86 microprocessor instruction set to perform fused
//! multiply-add (FMA) operations.
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
//! instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
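//!
//! # Examples
//!
//! A minimal sketch of calling one of these intrinsics after runtime feature
//! detection with `is_x86_feature_detected!`; the `fma_demo` wrapper below is
//! purely illustrative and not part of this module:
//!
//! ```
//! #[cfg(target_arch = "x86_64")]
//! fn fma_demo() {
//!     if is_x86_feature_detected!("fma") {
//!         use core::arch::x86_64::*;
//!         // SAFETY: the `fma` target feature was verified at runtime above.
//!         let _r = unsafe {
//!             // Each lane computes 2.0 * 3.0 + 1.0 == 7.0 with a single rounding.
//!             _mm_fmadd_ps(_mm_set1_ps(2.0), _mm_set1_ps(3.0), _mm_set1_ps(1.0))
//!         };
//!     }
//! }
//! # #[cfg(target_arch = "x86_64")]
//! # fma_demo();
//! ```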

use crate::core_arch::x86::*;
use crate::intrinsics::simd::{simd_fma, simd_neg};
use crate::intrinsics::{fmaf32, fmaf64};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
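    // `simd_fma` performs a fused multiply-add: each lane computes
    // `a * b + c` with a single rounding step.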
    unsafe { simd_fma(a, b, c) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe { simd_fma(a, b, c) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe { simd_fma(a, b, c) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe { simd_fma(a, b, c) }
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
/// Stores the result in the lower element of the returned value, and copies
/// the upper element from `a` to the upper element of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    unsafe {
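        // Compute the fused operation on lane 0 only with the scalar `fmaf64`
        // and write it back into lane 0 of `a`; the upper lane of the result
        // is therefore taken unchanged from `a`.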
        simd_insert!(
            a,
            0,
            fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
        )
    }
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
/// Stores the result in the lower element of the returned value, and copies
/// the 3 upper elements from `a` to the upper elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c))
        )
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
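        // Interleave the two results: in `simd_shuffle!`, indices >= 2 select
        // from the second input, so `[2, 1]` yields sub[0] for lane 0 and
        // add[1] for lane 1 (subtract on even lanes, add on odd lanes).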
        simd_shuffle!(add, sub, [2, 1])
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
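        // Same interleave as the 128-bit variants, extended to 8 lanes: even
        // result lanes come from `sub` (indices 8, 10, 12, 14) and odd lanes
        // from `add` (indices 1, 3, 5, 7).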
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
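    // `a * b - c` is computed as `fma(a, b, -c)`; negating `c` is an exact
    // sign flip, so the single rounding of the fused operation is preserved.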
    unsafe { simd_fma(a, b, simd_neg(c)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe { simd_fma(a, b, simd_neg(c)) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub213ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe { simd_fma(a, b, simd_neg(c)) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub213ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe { simd_fma(a, b, simd_neg(c)) }
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
/// result. Stores the result in the lower element of the returned value, and
/// copies the upper element from `a` to the upper element of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
        )
    }
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
/// result. Stores the result in the lower element of the returned value, and
/// copies the 3 upper elements from `a` to the upper elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c))
        )
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
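        // fmsubadd adds on even lanes and subtracts on odd lanes: index 0
        // selects add[0], while index 3 (>= 2, i.e. from the second input)
        // selects sub[1].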
        simd_shuffle!(add, sub, [0, 3])
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        simd_shuffle!(add, sub, [0, 5, 2, 7])
    }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        simd_shuffle!(add, sub, [0, 5, 2, 7])
    }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
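        // Even result lanes take add[i] (indices 0, 2, 4, 6); odd lanes take
        // sub[i] (indices 9, 11, 13, 15).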
        simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15])
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
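    // `-(a * b) + c` is computed as `fma(-a, b, c)`; negating `a` flips the
    // sign of the product exactly.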
    unsafe { simd_fma(simd_neg(a), b, c) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe { simd_fma(simd_neg(a), b, c) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe { simd_fma(simd_neg(a), b, c) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe { simd_fma(simd_neg(a), b, c) }
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and adds the negated intermediate result to the lower element
/// in `c`. Stores the result in the lower element of the returned value, and
/// copies the upper element from `a` to the upper element of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
        )
    }
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and adds the negated intermediate result to the lower element
/// in `c`. Stores the result in the lower element of the returned value, and
/// copies the 3 upper elements from `a` to the upper elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c))
        )
    }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
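    // `-(a * b) - c` is computed as `fma(-a, b, -c)`; both negations are
    // exact sign flips, so only the final fused rounding remains.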
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the negated
/// intermediate result. Stores the result in the lower element of the returned
/// value, and copies the upper element from `a` to the upper element of the
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
        )
    }
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the negated
/// intermediate result. Stores the result in the lower element of the
/// returned value, and copies the 3 upper elements from `a` to the upper
/// elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    unsafe {
        simd_insert!(
            a,
            0,
            fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c))
        )
    }
}

#[cfg(test)]
mod tests {
    use crate::core_arch::assert_eq_const as assert_eq;

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., 15.);
        assert_eq_m128d(_mm_fmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(9., 15., 22., 15.);
        assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., 15., 22., 15.);
        assert_eq_m128(_mm_fmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
        assert_eq_m256(_mm256_fmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., 2.);
        assert_eq_m128d(_mm_fmadd_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., 2., 3., 4.);
        assert_eq_m128(_mm_fmadd_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmaddsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., 15.);
        assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmaddsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(1., 15., 20., 15.);
        assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmaddsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., 15., 20., 15.);
        assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmaddsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
        assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., -3.);
        assert_eq_m128d(_mm_fmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(1., -3., 20., 1.);
        assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., -3., 20., 1.);
        assert_eq_m128(_mm_fmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
        assert_eq_m256(_mm256_fmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., 2.);
        assert_eq_m128d(_mm_fmsub_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., 2., 3., 4.);
        assert_eq_m128(_mm_fmsub_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsubadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., -3.);
        assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsubadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(9., -3., 22., 1.);
        assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fmsubadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., -3., 22., 1.);
        assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fmsubadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
        assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-1., 3.);
        assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(-1., 3., -20., -1.);
        assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-1., 3., -20., -1.);
        assert_eq_m128(_mm_fnmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
        assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-1., 2.);
        assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-1., 2., 3., 4.);
        assert_eq_m128(_mm_fnmadd_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-9., -15.);
        assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(-9., -15., -22., -15.);
        assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-9., -15., -22., -15.);
        assert_eq_m128(_mm_fnmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm256_fnmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
        assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-9., 2.);
        assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    const fn test_mm_fnmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-9., 2., 3., 4.);
        assert_eq_m128(_mm_fnmsub_ss(a, b, c), r);
    }
}