core/stdarch/crates/core_arch/src/x86/
avx512bf16.rs

1//! [AVX512BF16 intrinsics].
2//!
3//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
4
5use crate::arch::asm;
6use crate::core_arch::{simd::*, x86::*};
7use crate::intrinsics::simd::*;
8
9#[cfg(test)]
10use stdarch_test::assert_instr;
11
12#[allow(improper_ctypes)]
13unsafe extern "C" {
14    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"]
15    fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8;
16    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"]
17    fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16;
18    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"]
19    fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32;
20    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"]
21    fn cvtneps2bf16_256(a: f32x8) -> i16x8;
22    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"]
23    fn cvtneps2bf16_512(a: f32x16) -> i16x16;
24    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"]
25    fn dpbf16ps(a: f32x4, b: i16x8, c: i16x8) -> f32x4;
26    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"]
27    fn dpbf16ps_256(a: f32x8, b: i16x16, c: i16x16) -> f32x8;
28    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"]
29    fn dpbf16ps_512(a: f32x16, b: i16x32, c: i16x32) -> f32x16;
30}
31
32/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors
33/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
34/// 128-bit wide vector.
35/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh)
36#[inline]
37#[target_feature(enable = "avx512bf16,avx512vl")]
38#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
39#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
40pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
41    unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) }
42}
43
44/// Convert packed single-precision (32-bit) floating-point elements in two vectors
45/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
46/// in single vector dst using writemask k (elements are copied from src when the
47/// corresponding mask bit is not set).
48/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh)
49#[inline]
50#[target_feature(enable = "avx512bf16,avx512vl")]
51#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
52#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
53pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
54    unsafe {
55        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
56        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
57    }
58}
59
60/// Convert packed single-precision (32-bit) floating-point elements in two vectors
61/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
62/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
63/// mask bit is not set).
64/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh)
65#[inline]
66#[target_feature(enable = "avx512bf16,avx512vl")]
67#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
68#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
69pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
70    unsafe {
71        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
72        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
73    }
74}
75
76/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
77/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
78/// 256-bit wide vector.
79/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh)
80#[inline]
81#[target_feature(enable = "avx512bf16,avx512vl")]
82#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
83#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
84pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
85    unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) }
86}
87
88/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
89/// to packed BF16 (16-bit) floating-point elements and store the results in single vector
90/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
91/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
92#[inline]
93#[target_feature(enable = "avx512bf16,avx512vl")]
94#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
95#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
96pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh {
97    unsafe {
98        let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
99        transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
100    }
101}
102
103/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
104/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector
105/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh)
107#[inline]
108#[target_feature(enable = "avx512bf16,avx512vl")]
109#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
110#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
111pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
112    unsafe {
113        let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
114        transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
115    }
116}
117
118/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
119/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
120/// 512-bit wide vector.
121/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh)
122#[inline]
123#[target_feature(enable = "avx512bf16,avx512f")]
124#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
125#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
126pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
127    unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) }
128}
129
130/// Convert packed single-precision (32-bit) floating-point elements in two vectors
131/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
132/// in single vector dst using writemask k (elements are copied from src when the
133/// corresponding mask bit is not set).
134/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh)
135#[inline]
136#[target_feature(enable = "avx512bf16,avx512f")]
137#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
138#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
139pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh {
140    unsafe {
141        let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
142        transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
143    }
144}
145
146/// Convert packed single-precision (32-bit) floating-point elements in two vectors
147/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
148/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
149/// mask bit is not set).
150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh)
151#[inline]
152#[target_feature(enable = "avx512bf16,avx512f")]
153#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
154#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
155pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
156    unsafe {
157        let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
158        transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
159    }
160}
161
162/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
163/// floating-point elements, and store the results in dst.
164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh)
165#[inline]
166#[target_feature(enable = "avx512bf16,avx512vl")]
167#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
168#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
169pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
170    unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) }
171}
172
173/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
174/// floating-point elements, and store the results in dst using writemask k
175/// (elements are copied from src when the corresponding mask bit is not set).
176/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh)
177#[inline]
178#[target_feature(enable = "avx512bf16,avx512vl")]
179#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
180#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
181pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
182    unsafe {
183        let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
184        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
185    }
186}
187
188/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
189/// floating-point elements, and store the results in dst using zeromask k
190/// (elements are zeroed out when the corresponding mask bit is not set).
191/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh)
192#[inline]
193#[target_feature(enable = "avx512bf16,avx512vl")]
194#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
195#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
196pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
197    unsafe {
198        let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
199        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
200    }
201}
202
203/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
204/// floating-point elements, and store the results in dst.
205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh)
206#[inline]
207#[target_feature(enable = "avx512bf16,avx512f")]
208#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
209#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
210pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
211    unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) }
212}
213
214/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
215/// floating-point elements, and store the results in dst using writemask k
216/// (elements are copied from src when the corresponding mask bit is not set).
217/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh)
218#[inline]
219#[target_feature(enable = "avx512bf16,avx512f")]
220#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
221#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
222pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
223    unsafe {
224        let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
225        transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
226    }
227}
228
229/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
230/// floating-point elements, and store the results in dst using zeromask k
231/// (elements are zeroed out when the corresponding mask bit is not set).
232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh)
233#[inline]
234#[target_feature(enable = "avx512bf16,avx512f")]
235#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
236#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
237pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
238    unsafe {
239        let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
240        transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
241    }
242}
243
244/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
245/// accumulating the intermediate single-precision (32-bit) floating-point elements
246/// with elements in src, and store the results in dst.
247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps)
248#[inline]
249#[target_feature(enable = "avx512bf16,avx512vl")]
250#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
251#[cfg_attr(test, assert_instr("vdpbf16ps"))]
252pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
253    unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i16x8(), b.as_i16x8())) }
254}
255
256/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
257/// accumulating the intermediate single-precision (32-bit) floating-point elements
258/// with elements in src, and store the results in dst using writemask k
259/// (elements are copied from src when the corresponding mask bit is not set).
260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps)
261#[inline]
262#[target_feature(enable = "avx512bf16,avx512vl")]
263#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
264#[cfg_attr(test, assert_instr("vdpbf16ps"))]
265pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
266    unsafe {
267        let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
268        transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
269    }
270}
271
272/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
273/// accumulating the intermediate single-precision (32-bit) floating-point elements
274/// with elements in src, and store the results in dst using zeromask k
275/// (elements are zeroed out when the corresponding mask bit is not set).
276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps)
277#[inline]
278#[target_feature(enable = "avx512bf16,avx512vl")]
279#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
280#[cfg_attr(test, assert_instr("vdpbf16ps"))]
281pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
282    unsafe {
283        let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
284        let zero = _mm_set1_ps(0.0_f32).as_f32x4();
285        transmute(simd_select_bitmask(k, rst, zero))
286    }
287}
288
289/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
290/// accumulating the intermediate single-precision (32-bit) floating-point elements
291/// with elements in src, and store the results in dst.
292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
293#[inline]
294#[target_feature(enable = "avx512bf16,avx512vl")]
295#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
296#[cfg_attr(test, assert_instr("vdpbf16ps"))]
297pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
298    unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i16x16(), b.as_i16x16())) }
299}
300
301/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
302/// accumulating the intermediate single-precision (32-bit) floating-point elements
303/// with elements in src, and store the results in dst using writemask k
304/// (elements are copied from src when the corresponding mask bit is not set).
305/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
306#[inline]
307#[target_feature(enable = "avx512bf16,avx512vl")]
308#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
309#[cfg_attr(test, assert_instr("vdpbf16ps"))]
310pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
311    unsafe {
312        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
313        transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
314    }
315}
316
317/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
318/// accumulating the intermediate single-precision (32-bit) floating-point elements
319/// with elements in src, and store the results in dst using zeromask k
320/// (elements are zeroed out when the corresponding mask bit is not set).
321/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
322#[inline]
323#[target_feature(enable = "avx512bf16,avx512vl")]
324#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
325#[cfg_attr(test, assert_instr("vdpbf16ps"))]
326pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
327    unsafe {
328        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
329        transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
330    }
331}
332
333/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
334/// accumulating the intermediate single-precision (32-bit) floating-point elements
335/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit)
336/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit)
337/// floating-point elements with elements in src, and store the results in dst.
338/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
339#[inline]
340#[target_feature(enable = "avx512bf16,avx512f")]
341#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
342#[cfg_attr(test, assert_instr("vdpbf16ps"))]
343pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
344    unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i16x32(), b.as_i16x32())) }
345}
346
347/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
348/// accumulating the intermediate single-precision (32-bit) floating-point elements
349/// with elements in src, and store the results in dst using writemask k
350/// (elements are copied from src when the corresponding mask bit is not set).
351/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps)
352#[inline]
353#[target_feature(enable = "avx512bf16,avx512f")]
354#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
355#[cfg_attr(test, assert_instr("vdpbf16ps"))]
356pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
357    unsafe {
358        let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
359        transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
360    }
361}
362
363/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
364/// accumulating the intermediate single-precision (32-bit) floating-point elements
365/// with elements in src, and store the results in dst using zeromask k
366/// (elements are zeroed out when the corresponding mask bit is not set).
367/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps)
368#[inline]
369#[target_feature(enable = "avx512bf16,avx512f")]
370#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
371#[cfg_attr(test, assert_instr("vdpbf16ps"))]
372pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
373    unsafe {
374        let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
375        transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
376    }
377}
378
379/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
380/// floating-point elements, and store the results in dst.
381///
382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
383#[inline]
384#[target_feature(enable = "avx512bf16,avx512f")]
385#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
386pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
387    unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) }
388}
389
390/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
391/// floating-point elements, and store the results in dst using writemask k (elements are copied
392/// from src when the corresponding mask bit is not set).
393///
394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
395#[inline]
396#[target_feature(enable = "avx512bf16,avx512f")]
397#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
398pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
399    unsafe {
400        let cvt = _mm512_cvtpbh_ps(a);
401        transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
402    }
403}
404
405/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
406/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
407/// when the corresponding mask bit is not set).
408///
409/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
410#[inline]
411#[target_feature(enable = "avx512bf16,avx512f")]
412#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
413pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
414    unsafe {
415        let cvt = _mm512_cvtpbh_ps(a);
416        transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
417    }
418}
419
420/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
421/// floating-point elements, and store the results in dst.
422///
423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
424#[inline]
425#[target_feature(enable = "avx512bf16,avx512vl")]
426#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
427pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
428    unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) }
429}
430
431/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
432/// floating-point elements, and store the results in dst using writemask k (elements are copied
433/// from src when the corresponding mask bit is not set).
434///
435/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
436#[inline]
437#[target_feature(enable = "avx512bf16,avx512vl")]
438#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
439pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
440    unsafe {
441        let cvt = _mm256_cvtpbh_ps(a);
442        transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
443    }
444}
445
446/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
447/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
448/// when the corresponding mask bit is not set).
449///
450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
451#[inline]
452#[target_feature(enable = "avx512bf16,avx512vl")]
453#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
454pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
455    unsafe {
456        let cvt = _mm256_cvtpbh_ps(a);
457        transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
458    }
459}
460
461/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
462/// elements, and store the results in dst.
463///
464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
465#[inline]
466#[target_feature(enable = "avx512bf16,avx512vl")]
467#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
468pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
469    unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) }
470}
471
472/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
473/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding
474/// mask bit is not set).
475///
476/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
477#[inline]
478#[target_feature(enable = "avx512bf16,avx512vl")]
479#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
480pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
481    unsafe {
482        let cvt = _mm_cvtpbh_ps(a);
483        transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
484    }
485}
486
487/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
488/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
489/// mask bit is not set).
490///
491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
492#[inline]
493#[target_feature(enable = "avx512bf16,avx512vl")]
494#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
495pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
496    unsafe {
497        let cvt = _mm_cvtpbh_ps(a);
498        transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
499    }
500}
501
502/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
503/// element, and store the result in dst.
504///
505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
506#[inline]
507#[target_feature(enable = "avx512bf16,avx512f")]
508#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
509pub fn _mm_cvtsbh_ss(a: bf16) -> f32 {
510    f32::from_bits((a.to_bits() as u32) << 16)
511}
512
513/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
514/// floating-point elements, and store the results in dst.
515///
516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
517#[inline]
518#[target_feature(enable = "avx512bf16,avx512vl")]
519#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
520#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
521pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
522    unsafe {
523        let mut dst: __m128bh;
524        asm!(
525            "vcvtneps2bf16 {dst}, {src}",
526            dst = lateout(xmm_reg) dst,
527            src = in(xmm_reg) a,
528            options(pure, nomem, nostack, preserves_flags)
529        );
530        dst
531    }
532}
533
534/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
535/// floating-point elements, and store the results in dst using writemask k (elements are copied
536/// from src when the corresponding mask bit is not set).
537///
538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
539#[inline]
540#[target_feature(enable = "avx512bf16,avx512vl")]
541#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
542#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
543pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
544    unsafe {
545        let mut dst = src;
546        asm!(
547            "vcvtneps2bf16 {dst}{{{k}}},{src}",
548            dst = inlateout(xmm_reg) dst,
549            src = in(xmm_reg) a,
550            k = in(kreg) k,
551            options(pure, nomem, nostack, preserves_flags)
552        );
553        dst
554    }
555}
556
557/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
558/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
559/// when the corresponding mask bit is not set).
560///
561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
562#[inline]
563#[target_feature(enable = "avx512bf16,avx512vl")]
564#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
565#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
566pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
567    unsafe {
568        let mut dst: __m128bh;
569        asm!(
570            "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
571            dst = lateout(xmm_reg) dst,
572            src = in(xmm_reg) a,
573            k = in(kreg) k,
574            options(pure, nomem, nostack, preserves_flags)
575        );
576        dst
577    }
578}
579
580/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point
581/// element, and store the result in dst.
582///
583/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
584#[inline]
585#[target_feature(enable = "avx512bf16,avx512vl")]
586#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
587pub fn _mm_cvtness_sbh(a: f32) -> bf16 {
588    unsafe {
589        let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
590        bf16::from_bits(value)
591    }
592}
593
594#[cfg(test)]
595mod tests {
596    use crate::core_arch::simd::{f32x4, f32x8, f32x16, u16x4, u16x8, u16x16, u16x32};
597    use crate::{
598        core_arch::x86::*,
599        mem::{transmute, transmute_copy},
600    };
601    use stdarch_test::simd_test;
602
603    #[simd_test(enable = "avx512bf16,avx512vl")]
604    fn test_mm_cvtne2ps_pbh() {
605        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
606        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
607        let a = f32x4::from_array(a_array).as_m128();
608        let b = f32x4::from_array(b_array).as_m128();
609        let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
610        let result = *c.as_u16x8().as_array();
611        #[rustfmt::skip]
612        let expected_result: [u16; 8] = [
613            0b1_10000110_0110010,
614            0b1_10000010_0101000,
615            0b1_10000000_1110000,
616            0b1_10000100_1001001,
617            0b0_10000110_0110010,
618            0b0_10000010_0101000,
619            0b0_10000000_1110000,
620            0b0_10000100_1001001,
621        ];
622        assert_eq!(result, expected_result);
623    }
624
625    #[simd_test(enable = "avx512bf16,avx512vl")]
626    fn test_mm_mask_cvtne2ps_pbh() {
627        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
628        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
629        #[rustfmt::skip]
630        let src_array: [u16; 8] = [
631            0b0_10000110_0110010,
632            0b0_10000010_0101000,
633            0b0_10000000_1110000,
634            0b0_10000100_1001001,
635            0b0_10000110_0110010,
636            0b0_10000010_0101000,
637            0b0_10000000_1110000,
638            0b0_10000100_1001001,
639        ];
640        let src = u16x8::from_array(src_array).as_m128bh();
641        let a = f32x4::from_array(a_array).as_m128();
642        let b = f32x4::from_array(b_array).as_m128();
643        let k: __mmask8 = 0b1111_1111;
644        let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
645        let result = *c.as_u16x8().as_array();
646        #[rustfmt::skip]
647        let expected_result: [u16; 8] = [
648            0b1_10000110_0110010,
649            0b1_10000010_0101000,
650            0b1_10000000_1110000,
651            0b1_10000100_1001001,
652            0b0_10000110_0110010,
653            0b0_10000010_0101000,
654            0b0_10000000_1110000,
655            0b0_10000100_1001001,
656        ];
657        assert_eq!(result, expected_result);
658        let k = 0b0000_0000;
659        let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
660        let result = *c.as_u16x8().as_array();
661        let expected_result = src_array;
662        assert_eq!(result, expected_result);
663    }
664
665    #[simd_test(enable = "avx512bf16,avx512vl")]
666    fn test_mm_maskz_cvtne2ps_pbh() {
667        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
668        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
669        let a = f32x4::from_array(a_array).as_m128();
670        let b = f32x4::from_array(b_array).as_m128();
671        let k: __mmask8 = 0b1111_1111;
672        let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
673        let result = *c.as_u16x8().as_array();
674        #[rustfmt::skip]
675        let expected_result: [u16; 8] = [
676            0b1_10000110_0110010,
677            0b1_10000010_0101000,
678            0b1_10000000_1110000,
679            0b1_10000100_1001001,
680            0b0_10000110_0110010,
681            0b0_10000010_0101000,
682            0b0_10000000_1110000,
683            0b0_10000100_1001001,
684        ];
685        assert_eq!(result, expected_result);
686        let k = 0b0011_1100;
687        let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
688        let result = *c.as_u16x8().as_array();
689        #[rustfmt::skip]
690        let expected_result: [u16; 8] = [
691            0,
692            0,
693            0b1_10000000_1110000,
694            0b1_10000100_1001001,
695            0b0_10000110_0110010,
696            0b0_10000010_0101000,
697            0,
698            0,
699        ];
700        assert_eq!(result, expected_result);
701    }
702
703    #[simd_test(enable = "avx512bf16,avx512vl")]
704    fn test_mm256_cvtne2ps_pbh() {
705        #[rustfmt::skip]
706        let a_array = [
707            178.125_f32,
708            10.5_f32,
709            3.75_f32,
710            50.25_f32,
711            16.5_f32,
712            255.11_f32,
713            1000.158_f32,
714            575.575_f32,
715        ];
716        let b_array = [
717            -178.125_f32,
718            -10.5_f32,
719            -3.75_f32,
720            -50.25_f32,
721            -16.5_f32,
722            -255.11_f32,
723            -1000.158_f32,
724            -575.575_f32,
725        ];
726        let a = f32x8::from_array(a_array).as_m256();
727        let b = f32x8::from_array(b_array).as_m256();
728        let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
729        let result = *c.as_u16x16().as_array();
730        #[rustfmt::skip]
731        let expected_result: [u16; 16] = [
732            0b1_10000110_0110010,
733            0b1_10000010_0101000,
734            0b1_10000000_1110000,
735            0b1_10000100_1001001,
736            0b1_10000011_0000100,
737            0b1_10000110_1111111,
738            0b1_10001000_1111010,
739            0b1_10001000_0010000,
740            0b0_10000110_0110010,
741            0b0_10000010_0101000,
742            0b0_10000000_1110000,
743            0b0_10000100_1001001,
744            0b0_10000011_0000100,
745            0b0_10000110_1111111,
746            0b0_10001000_1111010,
747            0b0_10001000_0010000,
748        ];
749        assert_eq!(result, expected_result);
750    }
751
752    #[simd_test(enable = "avx512bf16,avx512vl")]
753    fn test_mm256_mask_cvtne2ps_pbh() {
754        #[rustfmt::skip]
755        let a_array = [
756            178.125_f32,
757            10.5_f32,
758            3.75_f32,
759            50.25_f32,
760            16.5_f32,
761            255.11_f32,
762            1000.158_f32,
763            575.575_f32,
764        ];
765        let b_array = [
766            -178.125_f32,
767            -10.5_f32,
768            -3.75_f32,
769            -50.25_f32,
770            -16.5_f32,
771            -255.11_f32,
772            -1000.158_f32,
773            -575.575_f32,
774        ];
775        let src_array: [u16; 16] = [
776            0b0_10000110_0110010,
777            0b0_10000010_0101000,
778            0b0_10000000_1110000,
779            0b0_10000100_1001001,
780            0b0_10000110_0110010,
781            0b0_10000010_0101000,
782            0b0_10000000_1110000,
783            0b0_10000100_1001001,
784            0b0_10000110_0110010,
785            0b0_10000010_0101000,
786            0b0_10000000_1110000,
787            0b0_10000100_1001001,
788            0b0_10000110_0110010,
789            0b0_10000010_0101000,
790            0b0_10000000_1110000,
791            0b0_10000100_1001001,
792        ];
793        let src = u16x16::from_array(src_array).as_m256bh();
794        let a = f32x8::from_array(a_array).as_m256();
795        let b = f32x8::from_array(b_array).as_m256();
796        let k: __mmask16 = 0xffff;
797        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
798        let result = *c.as_u16x16().as_array();
799        #[rustfmt::skip]
800        let expected_result: [u16; 16] = [
801            0b1_10000110_0110010,
802            0b1_10000010_0101000,
803            0b1_10000000_1110000,
804            0b1_10000100_1001001,
805            0b1_10000011_0000100,
806            0b1_10000110_1111111,
807            0b1_10001000_1111010,
808            0b1_10001000_0010000,
809            0b0_10000110_0110010,
810            0b0_10000010_0101000,
811            0b0_10000000_1110000,
812            0b0_10000100_1001001,
813            0b0_10000011_0000100,
814            0b0_10000110_1111111,
815            0b0_10001000_1111010,
816            0b0_10001000_0010000,
817        ];
818        assert_eq!(result, expected_result);
819        let k: __mmask16 = 0;
820        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
821        let result = *c.as_u16x16().as_array();
822        let expected_result = src_array;
823        assert_eq!(result, expected_result);
824    }
825
826    #[simd_test(enable = "avx512bf16,avx512vl")]
827    fn test_mm256_maskz_cvtne2ps_pbh() {
828        #[rustfmt::skip]
829        let a_array = [
830            178.125_f32,
831            10.5_f32,
832            3.75_f32,
833            50.25_f32,
834            16.5_f32,
835            255.11_f32,
836            1000.158_f32,
837            575.575_f32,
838        ];
839        let b_array = [
840            -178.125_f32,
841            -10.5_f32,
842            -3.75_f32,
843            -50.25_f32,
844            -16.5_f32,
845            -255.11_f32,
846            -1000.158_f32,
847            -575.575_f32,
848        ];
849        let a = f32x8::from_array(a_array).as_m256();
850        let b = f32x8::from_array(b_array).as_m256();
851        let k: __mmask16 = 0xffff;
852        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
853        let result = *c.as_u16x16().as_array();
854        #[rustfmt::skip]
855        let expected_result: [u16; 16] = [
856            0b1_10000110_0110010,
857            0b1_10000010_0101000,
858            0b1_10000000_1110000,
859            0b1_10000100_1001001,
860            0b1_10000011_0000100,
861            0b1_10000110_1111111,
862            0b1_10001000_1111010,
863            0b1_10001000_0010000,
864            0b0_10000110_0110010,
865            0b0_10000010_0101000,
866            0b0_10000000_1110000,
867            0b0_10000100_1001001,
868            0b0_10000011_0000100,
869            0b0_10000110_1111111,
870            0b0_10001000_1111010,
871            0b0_10001000_0010000,
872        ];
873        assert_eq!(result, expected_result);
874        let k: __mmask16 = 0b0110_1100_0011_0110;
875        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
876        let result = *c.as_u16x16().as_array();
877        #[rustfmt::skip]
878        let expected_result: [u16; 16] = [
879            0,
880            0b1_10000010_0101000,
881            0b1_10000000_1110000,
882            0,
883            0b1_10000011_0000100,
884            0b1_10000110_1111111,
885            0,
886            0,
887            0,
888            0,
889            0b0_10000000_1110000,
890            0b0_10000100_1001001,
891            0,
892            0b0_10000110_1111111,
893            0b0_10001000_1111010,
894            0,
895        ];
896        assert_eq!(result, expected_result);
897    }
898
899    #[simd_test(enable = "avx512bf16,avx512f")]
900    fn test_mm512_cvtne2ps_pbh() {
901        #[rustfmt::skip]
902        let a_array = [
903            178.125_f32,
904            10.5_f32,
905            3.75_f32,
906            50.25_f32,
907            16.5_f32,
908            255.11_f32,
909            1000.158_f32,
910            575.575_f32,
911            178.125_f32,
912            10.5_f32,
913            3.75_f32,
914            50.25_f32,
915            16.5_f32,
916            255.11_f32,
917            1000.158_f32,
918            575.575_f32,
919        ];
920        let b_array = [
921            -178.125_f32,
922            -10.5_f32,
923            -3.75_f32,
924            -50.25_f32,
925            -16.5_f32,
926            -255.11_f32,
927            -1000.158_f32,
928            -575.575_f32,
929            -178.125_f32,
930            -10.5_f32,
931            -3.75_f32,
932            -50.25_f32,
933            -16.5_f32,
934            -255.11_f32,
935            -1000.158_f32,
936            -575.575_f32,
937        ];
938        let a = f32x16::from_array(a_array).as_m512();
939        let b = f32x16::from_array(b_array).as_m512();
940        let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
941        let result = *c.as_u16x32().as_array();
942        #[rustfmt::skip]
943        let expected_result: [u16; 32] = [
944            0b1_10000110_0110010,
945            0b1_10000010_0101000,
946            0b1_10000000_1110000,
947            0b1_10000100_1001001,
948            0b1_10000011_0000100,
949            0b1_10000110_1111111,
950            0b1_10001000_1111010,
951            0b1_10001000_0010000,
952            0b1_10000110_0110010,
953            0b1_10000010_0101000,
954            0b1_10000000_1110000,
955            0b1_10000100_1001001,
956            0b1_10000011_0000100,
957            0b1_10000110_1111111,
958            0b1_10001000_1111010,
959            0b1_10001000_0010000,
960            0b0_10000110_0110010,
961            0b0_10000010_0101000,
962            0b0_10000000_1110000,
963            0b0_10000100_1001001,
964            0b0_10000011_0000100,
965            0b0_10000110_1111111,
966            0b0_10001000_1111010,
967            0b0_10001000_0010000,
968            0b0_10000110_0110010,
969            0b0_10000010_0101000,
970            0b0_10000000_1110000,
971            0b0_10000100_1001001,
972            0b0_10000011_0000100,
973            0b0_10000110_1111111,
974            0b0_10001000_1111010,
975            0b0_10001000_0010000,
976        ];
977        assert_eq!(result, expected_result);
978    }
979
980    #[simd_test(enable = "avx512bf16,avx512f")]
981    fn test_mm512_mask_cvtne2ps_pbh() {
982        #[rustfmt::skip]
983        let a_array = [
984            178.125_f32,
985            10.5_f32,
986            3.75_f32,
987            50.25_f32,
988            16.5_f32,
989            255.11_f32,
990            1000.158_f32,
991            575.575_f32,
992            178.125_f32,
993            10.5_f32,
994            3.75_f32,
995            50.25_f32,
996            16.5_f32,
997            255.11_f32,
998            1000.158_f32,
999            575.575_f32,
1000        ];
1001        let b_array = [
1002            -178.125_f32,
1003            -10.5_f32,
1004            -3.75_f32,
1005            -50.25_f32,
1006            -16.5_f32,
1007            -255.11_f32,
1008            -1000.158_f32,
1009            -575.575_f32,
1010            -178.125_f32,
1011            -10.5_f32,
1012            -3.75_f32,
1013            -50.25_f32,
1014            -16.5_f32,
1015            -255.11_f32,
1016            -1000.158_f32,
1017            -575.575_f32,
1018        ];
1019        let src_array: [u16; 32] = [
1020            0b0_10000110_0110010,
1021            0b0_10000010_0101000,
1022            0b0_10000000_1110000,
1023            0b0_10000100_1001001,
1024            0b0_10000110_0110010,
1025            0b0_10000010_0101000,
1026            0b0_10000000_1110000,
1027            0b0_10000100_1001001,
1028            0b0_10000110_0110010,
1029            0b0_10000010_0101000,
1030            0b0_10000000_1110000,
1031            0b0_10000100_1001001,
1032            0b0_10000110_0110010,
1033            0b0_10000010_0101000,
1034            0b0_10000000_1110000,
1035            0b0_10000100_1001001,
1036            0b0_10000110_0110010,
1037            0b0_10000010_0101000,
1038            0b0_10000000_1110000,
1039            0b0_10000100_1001001,
1040            0b0_10000110_0110010,
1041            0b0_10000010_0101000,
1042            0b0_10000000_1110000,
1043            0b0_10000100_1001001,
1044            0b0_10000110_0110010,
1045            0b0_10000010_0101000,
1046            0b0_10000000_1110000,
1047            0b0_10000100_1001001,
1048            0b0_10000110_0110010,
1049            0b0_10000010_0101000,
1050            0b0_10000000_1110000,
1051            0b0_10000100_1001001,
1052        ];
1053        let src = u16x32::from_array(src_array).as_m512bh();
1054        let a = f32x16::from_array(a_array).as_m512();
1055        let b = f32x16::from_array(b_array).as_m512();
1056        let k: __mmask32 = 0xffffffff;
1057        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
1058        let result = *c.as_u16x32().as_array();
1059        #[rustfmt::skip]
1060        let expected_result: [u16; 32] = [
1061            0b1_10000110_0110010,
1062            0b1_10000010_0101000,
1063            0b1_10000000_1110000,
1064            0b1_10000100_1001001,
1065            0b1_10000011_0000100,
1066            0b1_10000110_1111111,
1067            0b1_10001000_1111010,
1068            0b1_10001000_0010000,
1069            0b1_10000110_0110010,
1070            0b1_10000010_0101000,
1071            0b1_10000000_1110000,
1072            0b1_10000100_1001001,
1073            0b1_10000011_0000100,
1074            0b1_10000110_1111111,
1075            0b1_10001000_1111010,
1076            0b1_10001000_0010000,
1077            0b0_10000110_0110010,
1078            0b0_10000010_0101000,
1079            0b0_10000000_1110000,
1080            0b0_10000100_1001001,
1081            0b0_10000011_0000100,
1082            0b0_10000110_1111111,
1083            0b0_10001000_1111010,
1084            0b0_10001000_0010000,
1085            0b0_10000110_0110010,
1086            0b0_10000010_0101000,
1087            0b0_10000000_1110000,
1088            0b0_10000100_1001001,
1089            0b0_10000011_0000100,
1090            0b0_10000110_1111111,
1091            0b0_10001000_1111010,
1092            0b0_10001000_0010000,
1093        ];
1094        assert_eq!(result, expected_result);
1095        let k: __mmask32 = 0;
1096        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
1097        let result = *c.as_u16x32().as_array();
1098        let expected_result = src_array;
1099        assert_eq!(result, expected_result);
1100    }
1101
1102    #[simd_test(enable = "avx512bf16,avx512f")]
1103    fn test_mm512_maskz_cvtne2ps_pbh() {
1104        #[rustfmt::skip]
1105        let a_array = [
1106            178.125_f32,
1107            10.5_f32,
1108            3.75_f32,
1109            50.25_f32,
1110            16.5_f32,
1111            255.11_f32,
1112            1000.158_f32,
1113            575.575_f32,
1114            178.125_f32,
1115            10.5_f32,
1116            3.75_f32,
1117            50.25_f32,
1118            16.5_f32,
1119            255.11_f32,
1120            1000.158_f32,
1121            575.575_f32,
1122        ];
1123        let b_array = [
1124            -178.125_f32,
1125            -10.5_f32,
1126            -3.75_f32,
1127            -50.25_f32,
1128            -16.5_f32,
1129            -255.11_f32,
1130            -1000.158_f32,
1131            -575.575_f32,
1132            -178.125_f32,
1133            -10.5_f32,
1134            -3.75_f32,
1135            -50.25_f32,
1136            -16.5_f32,
1137            -255.11_f32,
1138            -1000.158_f32,
1139            -575.575_f32,
1140        ];
1141        let a = f32x16::from_array(a_array).as_m512();
1142        let b = f32x16::from_array(b_array).as_m512();
1143        let k: __mmask32 = 0xffffffff;
1144        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
1145        let result = *c.as_u16x32().as_array();
1146        #[rustfmt::skip]
1147        let expected_result: [u16; 32] = [
1148            0b1_10000110_0110010,
1149            0b1_10000010_0101000,
1150            0b1_10000000_1110000,
1151            0b1_10000100_1001001,
1152            0b1_10000011_0000100,
1153            0b1_10000110_1111111,
1154            0b1_10001000_1111010,
1155            0b1_10001000_0010000,
1156            0b1_10000110_0110010,
1157            0b1_10000010_0101000,
1158            0b1_10000000_1110000,
1159            0b1_10000100_1001001,
1160            0b1_10000011_0000100,
1161            0b1_10000110_1111111,
1162            0b1_10001000_1111010,
1163            0b1_10001000_0010000,
1164            0b0_10000110_0110010,
1165            0b0_10000010_0101000,
1166            0b0_10000000_1110000,
1167            0b0_10000100_1001001,
1168            0b0_10000011_0000100,
1169            0b0_10000110_1111111,
1170            0b0_10001000_1111010,
1171            0b0_10001000_0010000,
1172            0b0_10000110_0110010,
1173            0b0_10000010_0101000,
1174            0b0_10000000_1110000,
1175            0b0_10000100_1001001,
1176            0b0_10000011_0000100,
1177            0b0_10000110_1111111,
1178            0b0_10001000_1111010,
1179            0b0_10001000_0010000,
1180        ];
1181        assert_eq!(result, expected_result);
1182        let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
1183        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
1184        let result = *c.as_u16x32().as_array();
1185        #[rustfmt::skip]
1186        let expected_result: [u16; 32] = [
1187            0,
1188            0b1_10000010_0101000,
1189            0b1_10000000_1110000,
1190            0,
1191            0b1_10000011_0000100,
1192            0,
1193            0b1_10001000_1111010,
1194            0,
1195            0b1_10000110_0110010,
1196            0b1_10000010_0101000,
1197            0,
1198            0,
1199            0,
1200            0b1_10000110_1111111,
1201            0,
1202            0b1_10001000_0010000,
1203            0,
1204            0b0_10000010_0101000,
1205            0b0_10000000_1110000,
1206            0,
1207            0b0_10000011_0000100,
1208            0,
1209            0,
1210            0b0_10001000_0010000,
1211            0,
1212            0b0_10000010_0101000,
1213            0,
1214            0b0_10000100_1001001,
1215            0,
1216            0,
1217            0b0_10001000_1111010,
1218            0b0_10001000_0010000,
1219        ];
1220        assert_eq!(result, expected_result);
1221    }
1222
1223    #[simd_test(enable = "avx512bf16,avx512vl")]
1224    fn test_mm256_cvtneps_pbh() {
1225        #[rustfmt::skip]
1226        let a_array = [
1227            178.125_f32,
1228            10.5_f32,
1229            3.75_f32,
1230            50.25_f32,
1231            16.5_f32,
1232            255.11_f32,
1233            1000.158_f32,
1234            575.575_f32,
1235        ];
1236        let a = f32x8::from_array(a_array).as_m256();
1237        let c: __m128bh = _mm256_cvtneps_pbh(a);
1238        let result = *c.as_u16x8().as_array();
1239        #[rustfmt::skip]
1240        let expected_result: [u16; 8] = [
1241            0b0_10000110_0110010,
1242            0b0_10000010_0101000,
1243            0b0_10000000_1110000,
1244            0b0_10000100_1001001,
1245            0b0_10000011_0000100,
1246            0b0_10000110_1111111,
1247            0b0_10001000_1111010,
1248            0b0_10001000_0010000,
1249        ];
1250        assert_eq!(result, expected_result);
1251    }
1252
1253    #[simd_test(enable = "avx512bf16,avx512vl")]
1254    fn test_mm256_mask_cvtneps_pbh() {
1255        #[rustfmt::skip]
1256        let a_array = [
1257            178.125_f32,
1258            10.5_f32,
1259            3.75_f32,
1260            50.25_f32,
1261            16.5_f32,
1262            255.11_f32,
1263            1000.158_f32,
1264            575.575_f32,
1265        ];
1266        let src_array: [u16; 8] = [
1267            0b1_10000110_0110010,
1268            0b1_10000010_0101000,
1269            0b1_10000000_1110000,
1270            0b1_10000100_1001001,
1271            0b1_10000011_0000100,
1272            0b1_10000110_1111111,
1273            0b1_10001000_1111010,
1274            0b1_10001000_0010000,
1275        ];
1276        let src = u16x8::from_array(src_array).as_m128bh();
1277        let a = f32x8::from_array(a_array).as_m256();
1278        let k: __mmask8 = 0xff;
1279        let b = _mm256_mask_cvtneps_pbh(src, k, a);
1280        let result = *b.as_u16x8().as_array();
1281        #[rustfmt::skip]
1282        let expected_result: [u16; 8] = [
1283            0b0_10000110_0110010,
1284            0b0_10000010_0101000,
1285            0b0_10000000_1110000,
1286            0b0_10000100_1001001,
1287            0b0_10000011_0000100,
1288            0b0_10000110_1111111,
1289            0b0_10001000_1111010,
1290            0b0_10001000_0010000,
1291        ];
1292        assert_eq!(result, expected_result);
1293        let k: __mmask8 = 0x0;
1294        let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
1295        let result = *b.as_u16x8().as_array();
1296        let expected_result: [u16; 8] = src_array;
1297        assert_eq!(result, expected_result);
1298    }
1299
1300    #[simd_test(enable = "avx512bf16,avx512vl")]
1301    fn test_mm256_maskz_cvtneps_pbh() {
1302        #[rustfmt::skip]
1303        let a_array = [
1304            178.125_f32,
1305            10.5_f32,
1306            3.75_f32,
1307            50.25_f32,
1308            16.5_f32,
1309            255.11_f32,
1310            1000.158_f32,
1311            575.575_f32,
1312        ];
1313        let a = f32x8::from_array(a_array).as_m256();
1314        let k: __mmask8 = 0xff;
1315        let b = _mm256_maskz_cvtneps_pbh(k, a);
1316        let result = *b.as_u16x8().as_array();
1317        #[rustfmt::skip]
1318        let expected_result: [u16; 8] = [
1319            0b0_10000110_0110010,
1320            0b0_10000010_0101000,
1321            0b0_10000000_1110000,
1322            0b0_10000100_1001001,
1323            0b0_10000011_0000100,
1324            0b0_10000110_1111111,
1325            0b0_10001000_1111010,
1326            0b0_10001000_0010000,
1327        ];
1328        assert_eq!(result, expected_result);
1329        let k: __mmask8 = 0x6;
1330        let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
1331        let result = *b.as_u16x8().as_array();
1332        let expected_result: [u16; 8] =
1333            [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
1334        assert_eq!(result, expected_result);
1335    }
1336
1337    #[simd_test(enable = "avx512bf16,avx512f")]
1338    fn test_mm512_cvtneps_pbh() {
1339        #[rustfmt::skip]
1340        let a_array = [
1341            178.125_f32,
1342            10.5_f32,
1343            3.75_f32,
1344            50.25_f32,
1345            16.5_f32,
1346            255.11_f32,
1347            1000.158_f32,
1348            575.575_f32,
1349            178.125_f32,
1350            10.5_f32,
1351            3.75_f32,
1352            50.25_f32,
1353            16.5_f32,
1354            255.11_f32,
1355            1000.158_f32,
1356            575.575_f32,
1357        ];
1358        let a = f32x16::from_array(a_array).as_m512();
1359        let c: __m256bh = _mm512_cvtneps_pbh(a);
1360        let result = *c.as_u16x16().as_array();
1361        #[rustfmt::skip]
1362        let expected_result: [u16; 16] = [
1363            0b0_10000110_0110010,
1364            0b0_10000010_0101000,
1365            0b0_10000000_1110000,
1366            0b0_10000100_1001001,
1367            0b0_10000011_0000100,
1368            0b0_10000110_1111111,
1369            0b0_10001000_1111010,
1370            0b0_10001000_0010000,
1371            0b0_10000110_0110010,
1372            0b0_10000010_0101000,
1373            0b0_10000000_1110000,
1374            0b0_10000100_1001001,
1375            0b0_10000011_0000100,
1376            0b0_10000110_1111111,
1377            0b0_10001000_1111010,
1378            0b0_10001000_0010000,
1379        ];
1380        assert_eq!(result, expected_result);
1381    }
1382
1383    #[simd_test(enable = "avx512bf16,avx512f")]
1384    fn test_mm512_mask_cvtneps_pbh() {
1385        #[rustfmt::skip]
1386        let a_array = [
1387            178.125_f32,
1388            10.5_f32,
1389            3.75_f32,
1390            50.25_f32,
1391            16.5_f32,
1392            255.11_f32,
1393            1000.158_f32,
1394            575.575_f32,
1395            178.125_f32,
1396            10.5_f32,
1397            3.75_f32,
1398            50.25_f32,
1399            16.5_f32,
1400            255.11_f32,
1401            1000.158_f32,
1402            575.575_f32,
1403        ];
1404        let src_array: [u16; 16] = [
1405            0b1_10000110_0110010,
1406            0b1_10000010_0101000,
1407            0b1_10000000_1110000,
1408            0b1_10000100_1001001,
1409            0b1_10000011_0000100,
1410            0b1_10000110_1111111,
1411            0b1_10001000_1111010,
1412            0b1_10001000_0010000,
1413            0b1_10000110_0110010,
1414            0b1_10000010_0101000,
1415            0b1_10000000_1110000,
1416            0b1_10000100_1001001,
1417            0b1_10000011_0000100,
1418            0b1_10000110_1111111,
1419            0b1_10001000_1111010,
1420            0b1_10001000_0010000,
1421        ];
1422        let src = u16x16::from_array(src_array).as_m256bh();
1423        let a = f32x16::from_array(a_array).as_m512();
1424        let k: __mmask16 = 0xffff;
1425        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1426        let result = *c.as_u16x16().as_array();
1427        #[rustfmt::skip]
1428        let expected_result: [u16; 16] = [
1429            0b0_10000110_0110010,
1430            0b0_10000010_0101000,
1431            0b0_10000000_1110000,
1432            0b0_10000100_1001001,
1433            0b0_10000011_0000100,
1434            0b0_10000110_1111111,
1435            0b0_10001000_1111010,
1436            0b0_10001000_0010000,
1437            0b0_10000110_0110010,
1438            0b0_10000010_0101000,
1439            0b0_10000000_1110000,
1440            0b0_10000100_1001001,
1441            0b0_10000011_0000100,
1442            0b0_10000110_1111111,
1443            0b0_10001000_1111010,
1444            0b0_10001000_0010000,
1445        ];
1446        assert_eq!(result, expected_result);
1447        let k: __mmask16 = 0;
1448        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1449        let result = *c.as_u16x16().as_array();
1450        let expected_result = src_array;
1451        assert_eq!(result, expected_result);
1452    }
1453
1454    #[simd_test(enable = "avx512bf16,avx512f")]
1455    fn test_mm512_maskz_cvtneps_pbh() {
1456        #[rustfmt::skip]
1457        let a_array = [
1458            178.125_f32,
1459            10.5_f32,
1460            3.75_f32,
1461            50.25_f32,
1462            16.5_f32,
1463            255.11_f32,
1464            1000.158_f32,
1465            575.575_f32,
1466            178.125_f32,
1467            10.5_f32,
1468            3.75_f32,
1469            50.25_f32,
1470            16.5_f32,
1471            255.11_f32,
1472            1000.158_f32,
1473            575.575_f32,
1474        ];
1475        let a = f32x16::from_array(a_array).as_m512();
1476        let k: __mmask16 = 0xffff;
1477        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1478        let result = *c.as_u16x16().as_array();
1479        #[rustfmt::skip]
1480        let expected_result: [u16; 16] = [
1481            0b0_10000110_0110010,
1482            0b0_10000010_0101000,
1483            0b0_10000000_1110000,
1484            0b0_10000100_1001001,
1485            0b0_10000011_0000100,
1486            0b0_10000110_1111111,
1487            0b0_10001000_1111010,
1488            0b0_10001000_0010000,
1489            0b0_10000110_0110010,
1490            0b0_10000010_0101000,
1491            0b0_10000000_1110000,
1492            0b0_10000100_1001001,
1493            0b0_10000011_0000100,
1494            0b0_10000110_1111111,
1495            0b0_10001000_1111010,
1496            0b0_10001000_0010000,
1497        ];
1498        assert_eq!(result, expected_result);
1499        let k: __mmask16 = 0x653a;
1500        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1501        let result = *c.as_u16x16().as_array();
1502        #[rustfmt::skip]
1503        let expected_result: [u16; 16] = [
1504            0,
1505            0b0_10000010_0101000,
1506            0,
1507            0b0_10000100_1001001,
1508            0b0_10000011_0000100,
1509            0b0_10000110_1111111,
1510            0,
1511            0,
1512            0b0_10000110_0110010,
1513            0,
1514            0b0_10000000_1110000,
1515            0,
1516            0,
1517            0b0_10000110_1111111,
1518            0b0_10001000_1111010,
1519            0,
1520        ];
1521        assert_eq!(result, expected_result);
1522    }
1523
1524    #[simd_test(enable = "avx512bf16,avx512vl")]
1525    fn test_mm_dpbf16_ps() {
1526        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1527        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1528        let a1 = f32x4::from_array(a_array).as_m128();
1529        let b1 = f32x4::from_array(b_array).as_m128();
1530        let src = f32x4::from_array([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]).as_m128();
1531        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1532        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1533        let c: __m128 = _mm_dpbf16_ps(src, a, b);
1534        let result = *c.as_f32x4().as_array();
1535        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1536        assert_eq!(result, expected_result);
1537    }
1538
1539    #[simd_test(enable = "avx512bf16,avx512vl")]
1540    fn test_mm_mask_dpbf16_ps() {
1541        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1542        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1543        let a1 = f32x4::from_array(a_array).as_m128();
1544        let b1 = f32x4::from_array(b_array).as_m128();
1545        let k: __mmask8 = 0xf3;
1546        let src = f32x4::from_array([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]).as_m128();
1547        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1548        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1549        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1550        let result = *c.as_f32x4().as_array();
1551        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
1552        assert_eq!(result, expected_result);
1553        let k: __mmask8 = 0xff;
1554        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1555        let result = *c.as_f32x4().as_array();
1556        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1557        assert_eq!(result, expected_result);
1558        let k: __mmask8 = 0;
1559        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1560        let result = *c.as_f32x4().as_array();
1561        let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
1562        assert_eq!(result, expected_result);
1563    }
1564
1565    #[simd_test(enable = "avx512bf16,avx512vl")]
1566    fn test_mm_maskz_dpbf16_ps() {
1567        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1568        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1569        let a1 = f32x4::from_array(a_array).as_m128();
1570        let b1 = f32x4::from_array(b_array).as_m128();
1571        let k: __mmask8 = 0xf3;
1572        let src = f32x4::from_array([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]).as_m128();
1573        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1574        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1575        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1576        let result = *c.as_f32x4().as_array();
1577        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
1578        assert_eq!(result, expected_result);
1579        let k: __mmask8 = 0xff;
1580        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1581        let result = *c.as_f32x4().as_array();
1582        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1583        assert_eq!(result, expected_result);
1584        let k: __mmask8 = 0;
1585        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1586        let result = *c.as_f32x4().as_array();
1587        let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
1588        assert_eq!(result, expected_result);
1589    }
1590
1591    #[simd_test(enable = "avx512bf16,avx512vl")]
1592    fn test_mm256_dpbf16_ps() {
1593        #[rustfmt::skip]
1594        let a_array = [
1595            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1596        ];
1597        let b_array = [
1598            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1599        ];
1600        let a1 = f32x8::from_array(a_array).as_m256();
1601        let b1 = f32x8::from_array(b_array).as_m256();
1602        #[rustfmt::skip]
1603        let src = f32x8::from_array([
1604            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1605        ]).as_m256();
1606        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1607        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1608        let c: __m256 = _mm256_dpbf16_ps(src, a, b);
1609        let result = *c.as_f32x8().as_array();
1610        #[rustfmt::skip]
1611        let expected_result: [f32; 8] = [
1612            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1613        ];
1614        assert_eq!(result, expected_result);
1615    }
1616
1617    #[simd_test(enable = "avx512bf16,avx512vl")]
1618    fn test_mm256_mask_dpbf16_ps() {
1619        #[rustfmt::skip]
1620        let a_array = [
1621            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1622        ];
1623        let b_array = [
1624            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1625        ];
1626        let a1 = f32x8::from_array(a_array).as_m256();
1627        let b1 = f32x8::from_array(b_array).as_m256();
1628        let k: __mmask8 = 0x33;
1629        #[rustfmt::skip]
1630        let src = f32x8::from_array([
1631            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1632        ]).as_m256();
1633        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1634        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1635        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1636        let result = *c.as_f32x8().as_array();
1637        #[rustfmt::skip]
1638        let expected_result: [f32; 8] = [
1639            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1640        ];
1641        assert_eq!(result, expected_result);
1642        let k: __mmask8 = 0xff;
1643        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1644        let result = *c.as_f32x8().as_array();
1645        #[rustfmt::skip]
1646        let expected_result: [f32; 8] = [
1647            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1648        ];
1649        assert_eq!(result, expected_result);
1650        let k: __mmask8 = 0;
1651        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1652        let result = *c.as_f32x8().as_array();
1653        #[rustfmt::skip]
1654        let expected_result: [f32; 8] = [
1655            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1656        ];
1657        assert_eq!(result, expected_result);
1658    }
1659
1660    #[simd_test(enable = "avx512bf16,avx512vl")]
1661    fn test_mm256_maskz_dpbf16_ps() {
1662        #[rustfmt::skip]
1663        let a_array = [
1664            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1665        ];
1666        let b_array = [
1667            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1668        ];
1669        let a1 = f32x8::from_array(a_array).as_m256();
1670        let b1 = f32x8::from_array(b_array).as_m256();
1671        let k: __mmask8 = 0x33;
1672        #[rustfmt::skip]
1673        let src = f32x8::from_array([
1674            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1675        ]).as_m256();
1676        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1677        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1678        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1679        let result = *c.as_f32x8().as_array();
1680        #[rustfmt::skip]
1681        let expected_result: [f32; 8] = [
1682            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1683        ];
1684        assert_eq!(result, expected_result);
1685        let k: __mmask8 = 0xff;
1686        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1687        let result = *c.as_f32x8().as_array();
1688        #[rustfmt::skip]
1689        let expected_result: [f32; 8] = [
1690            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1691        ];
1692        assert_eq!(result, expected_result);
1693        let k: __mmask8 = 0;
1694        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1695        let result = *c.as_f32x8().as_array();
1696        let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
1697        assert_eq!(result, expected_result);
1698    }
1699
1700    #[simd_test(enable = "avx512bf16,avx512f")]
1701    fn test_mm512_dpbf16_ps() {
1702        #[rustfmt::skip]
1703        let a_array = [
1704            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1705            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1706        ];
1707        let b_array = [
1708            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1709            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1710        ];
1711        let a1 = f32x16::from_array(a_array).as_m512();
1712        let b1 = f32x16::from_array(b_array).as_m512();
1713        let src = f32x16::from_array([
1714            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1715            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1716        ])
1717        .as_m512();
1718        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1719        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1720        let c: __m512 = _mm512_dpbf16_ps(src, a, b);
1721        let result = *c.as_f32x16().as_array();
1722        #[rustfmt::skip]
1723        let expected_result: [f32; 16] = [
1724            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1725            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1726        ];
1727        assert_eq!(result, expected_result);
1728    }
1729
1730    #[simd_test(enable = "avx512bf16,avx512f")]
1731    fn test_mm512_mask_dpbf16_ps() {
1732        #[rustfmt::skip]
1733        let a_array = [
1734            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1735            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1736        ];
1737        let b_array = [
1738            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1739            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1740        ];
1741        let a1 = f32x16::from_array(a_array).as_m512();
1742        let b1 = f32x16::from_array(b_array).as_m512();
1743        let k: __mmask16 = 0x3333;
1744        #[rustfmt::skip]
1745        let src = f32x16::from_array([
1746            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1747            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1748        ]).as_m512();
1749        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1750        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1751        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1752        let result = *c.as_f32x16().as_array();
1753        #[rustfmt::skip]
1754        let expected_result: [f32; 16] = [
1755            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1756            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1757        ];
1758        assert_eq!(result, expected_result);
1759        let k: __mmask16 = 0xffff;
1760        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1761        let result = *c.as_f32x16().as_array();
1762        #[rustfmt::skip]
1763        let expected_result: [f32; 16] = [
1764            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1765            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1766        ];
1767        assert_eq!(result, expected_result);
1768        let k: __mmask16 = 0;
1769        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1770        let result = *c.as_f32x16().as_array();
1771        #[rustfmt::skip]
1772        let expected_result: [f32; 16] = [
1773            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1774            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1775        ];
1776        assert_eq!(result, expected_result);
1777    }
1778
1779    #[simd_test(enable = "avx512bf16,avx512f")]
1780    fn test_mm512_maskz_dpbf16_ps() {
1781        #[rustfmt::skip]
1782        let a_array = [
1783            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1784            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1785        ];
1786        let b_array = [
1787            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1788            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1789        ];
1790        let a1 = f32x16::from_array(a_array).as_m512();
1791        let b1 = f32x16::from_array(b_array).as_m512();
1792        let k: __mmask16 = 0x3333;
1793        #[rustfmt::skip]
1794        let src = f32x16::from_array([
1795            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1796            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1797        ]).as_m512();
1798        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1799        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1800        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1801        let result = *c.as_f32x16().as_array();
1802        #[rustfmt::skip]
1803        let expected_result: [f32; 16] = [
1804            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
1805            0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1806        ];
1807        assert_eq!(result, expected_result);
1808        let k: __mmask16 = 0xffff;
1809        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1810        let result = *c.as_f32x16().as_array();
1811        #[rustfmt::skip]
1812        let expected_result: [f32; 16] = [
1813            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1814            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1815        ];
1816        assert_eq!(result, expected_result);
1817        let k: __mmask16 = 0;
1818        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1819        let result = *c.as_f32x16().as_array();
1820        #[rustfmt::skip]
1821        let expected_result: [f32; 16] = [
1822            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1823        ];
1824        assert_eq!(result, expected_result);
1825    }
1826
1827    const BF16_ONE: u16 = 0b0_01111111_0000000;
1828    const BF16_TWO: u16 = 0b0_10000000_0000000;
1829    const BF16_THREE: u16 = 0b0_10000000_1000000;
1830    const BF16_FOUR: u16 = 0b0_10000001_0000000;
1831    const BF16_FIVE: u16 = 0b0_10000001_0100000;
1832    const BF16_SIX: u16 = 0b0_10000001_1000000;
1833    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
1834    const BF16_EIGHT: u16 = 0b0_10000010_0000000;
1835
1836    #[simd_test(enable = "avx512bf16")]
1837    fn test_mm512_cvtpbh_ps() {
1838        let a = __m256bh([
1839            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1840            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1841        ]);
1842        let r = _mm512_cvtpbh_ps(a);
1843        let e = _mm512_setr_ps(
1844            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
1845        );
1846        assert_eq_m512(r, e);
1847    }
1848
1849    #[simd_test(enable = "avx512bf16")]
1850    fn test_mm512_mask_cvtpbh_ps() {
1851        let a = __m256bh([
1852            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1853            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1854        ]);
1855        let src = _mm512_setr_ps(
1856            9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16.,
1857        );
1858        let k = 0b1010_1010_1010_1010;
1859        let r = _mm512_mask_cvtpbh_ps(src, k, a);
1860        let e = _mm512_setr_ps(
1861            9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8.,
1862        );
1863        assert_eq_m512(r, e);
1864    }
1865
1866    #[simd_test(enable = "avx512bf16")]
1867    fn test_mm512_maskz_cvtpbh_ps() {
1868        let a = __m256bh([
1869            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1870            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1871        ]);
1872        let k = 0b1010_1010_1010_1010;
1873        let r = _mm512_maskz_cvtpbh_ps(k, a);
1874        let e = _mm512_setr_ps(
1875            0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8.,
1876        );
1877        assert_eq_m512(r, e);
1878    }
1879
1880    #[simd_test(enable = "avx512bf16,avx512vl")]
1881    fn test_mm256_cvtpbh_ps() {
1882        let a = __m128bh([
1883            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1884        ]);
1885        let r = _mm256_cvtpbh_ps(a);
1886        let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
1887        assert_eq_m256(r, e);
1888    }
1889
1890    #[simd_test(enable = "avx512bf16,avx512vl")]
1891    fn test_mm256_mask_cvtpbh_ps() {
1892        let a = __m128bh([
1893            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1894        ]);
1895        let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
1896        let k = 0b1010_1010;
1897        let r = _mm256_mask_cvtpbh_ps(src, k, a);
1898        let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.);
1899        assert_eq_m256(r, e);
1900    }
1901
1902    #[simd_test(enable = "avx512bf16,avx512vl")]
1903    fn test_mm256_maskz_cvtpbh_ps() {
1904        let a = __m128bh([
1905            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1906        ]);
1907        let k = 0b1010_1010;
1908        let r = _mm256_maskz_cvtpbh_ps(k, a);
1909        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
1910        assert_eq_m256(r, e);
1911    }
1912
1913    #[simd_test(enable = "avx512bf16,avx512vl")]
1914    fn test_mm_cvtpbh_ps() {
1915        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1916        let r = _mm_cvtpbh_ps(a);
1917        let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1918        assert_eq_m128(r, e);
1919    }
1920
1921    #[simd_test(enable = "avx512bf16,avx512vl")]
1922    fn test_mm_mask_cvtpbh_ps() {
1923        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1924        let src = _mm_setr_ps(9., 10., 11., 12.);
1925        let k = 0b1010;
1926        let r = _mm_mask_cvtpbh_ps(src, k, a);
1927        let e = _mm_setr_ps(9., 2., 11., 4.);
1928        assert_eq_m128(r, e);
1929    }
1930
1931    #[simd_test(enable = "avx512bf16,avx512vl")]
1932    fn test_mm_maskz_cvtpbh_ps() {
1933        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1934        let k = 0b1010;
1935        let r = _mm_maskz_cvtpbh_ps(k, a);
1936        let e = _mm_setr_ps(0., 2., 0., 4.);
1937        assert_eq_m128(r, e);
1938    }
1939
1940    #[simd_test(enable = "avx512bf16")]
1941    fn test_mm_cvtsbh_ss() {
1942        let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE));
1943        assert_eq!(r, 1.);
1944    }
1945
1946    #[simd_test(enable = "avx512bf16,avx512vl")]
1947    fn test_mm_cvtneps_pbh() {
1948        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1949        let r: u16x4 = unsafe { transmute_copy(&_mm_cvtneps_pbh(a)) };
1950        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
1951        assert_eq!(r, e);
1952    }
1953
1954    #[simd_test(enable = "avx512bf16,avx512vl")]
1955    fn test_mm_mask_cvtneps_pbh() {
1956        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1957        let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]);
1958        let k = 0b1010;
1959        let r: u16x4 = unsafe { transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)) };
1960        let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR);
1961        assert_eq!(r, e);
1962    }
1963
1964    #[simd_test(enable = "avx512bf16,avx512vl")]
1965    fn test_mm_maskz_cvtneps_pbh() {
1966        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1967        let k = 0b1010;
1968        let r: u16x4 = unsafe { transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)) };
1969        let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR);
1970        assert_eq!(r, e);
1971    }
1972
1973    #[simd_test(enable = "avx512bf16,avx512vl")]
1974    fn test_mm_cvtness_sbh() {
1975        let r = _mm_cvtness_sbh(1.);
1976        assert_eq!(r.to_bits(), BF16_ONE);
1977    }
1978}
core/stdarch/crates/core_arch/src/x86/avx512bf16.rs

core/stdarch/crates/core_arch/src/x86/
avx512bf16.rs