core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
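///
/// # Examples
///
/// A minimal usage sketch via the `std::arch::x86_64` re-exports, assuming an
/// AVX2-capable target:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
///     let r = _mm256_abs_epi32(a);
///     // Each element is now its absolute value: [1, 2, 3, 4, 5, 6, 7, 8].
/// }
/// ```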
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
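///
/// # Examples
///
/// A minimal sketch showing the saturating behaviour (via `std::arch::x86_64`,
/// assuming AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_epi8(100);
///     let b = _mm256_set1_epi8(100);
///     // 100 + 100 exceeds i8::MAX, so every lane saturates to 127
///     // instead of wrapping around.
///     let r = _mm256_adds_epi8(a, b);
/// }
/// ```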
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
}

/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
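///
/// # Examples
///
/// A minimal sketch of the per-lane byte alignment (via `std::arch::x86_64`,
/// assuming AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_epi8(1);
///     let b = _mm256_set1_epi8(2);
///     // With IMM8 = 4, each 16-byte lane of the result holds the upper
///     // twelve bytes of `b`'s lane followed by the lower four bytes of
///     // `a`'s lane: twelve 2s, then four 1s.
///     let r = _mm256_alignr_epi8::<4>(a, b);
/// }
/// ```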
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}

/// Computes the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
}

/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
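///
/// # Examples
///
/// A minimal sketch showing that the average rounds up (via
/// `std::arch::x86_64`, assuming AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_epi16(1);
///     let b = _mm256_set1_epi16(2);
///     // (1 + 2 + 1) >> 1 == 2, so every lane holds 2.
///     let r = _mm256_avg_epu16(a, b);
/// }
/// ```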
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
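///
/// # Examples
///
/// A minimal sketch of the per-element selection (via `std::arch::x86_64`,
/// assuming AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_epi32(0);
///     let b = _mm256_set1_epi32(1);
///     // Bit i of IMM8 picks element i from `b` (1) or `a` (0), so this
///     // takes only element 0 from `b` and the rest from `a`.
///     let r = _mm256_blend_epi32::<0b0000_0001>(a, b);
/// }
/// ```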
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
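///
/// # Examples
///
/// A minimal sketch (via `std::arch::x86_64`, assuming AVX2 is available);
/// only the high bit of each mask byte matters:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_epi8(1);
///     let b = _mm256_set1_epi8(2);
///     let mask = _mm256_set1_epi8(-128); // high bit set in every byte
///     // Every byte comes from `b`, so the result is all 2s.
///     let r = _mm256_blendv_epi8(a, b, mask);
/// }
/// ```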
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}

/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}

/// Compares packed 64-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
}

/// Compares packed 64-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
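///
/// # Examples
///
/// A minimal sketch (via `std::arch::x86_64`, assuming AVX2 is available);
/// comparisons produce all-ones for true and all-zeros for false:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_epi64x(5);
///     let b = _mm256_set1_epi64x(3);
///     // 5 > 3 in every lane, so every 64-bit element is -1 (all bits set).
///     let r = _mm256_cmpgt_epi64(a, b);
/// }
/// ```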
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
}

/// Sign-extend 16-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
}

/// Sign-extend 16-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Sign-extend 32-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
}

/// Sign-extend 8-bit integers to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
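///
/// # Examples
///
/// A minimal sketch of the sign extension (via `std::arch::x86_64`, assuming
/// AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set1_epi8(-5);
///     // Each 8-bit -5 widens to a 16-bit -5, preserving the sign.
///     let r = _mm256_cvtepi8_epi16(a);
/// }
/// ```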
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
}

/// Sign-extend 8-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Sign-extend 8-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
}

/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
/// integers. The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u16x8();
        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
}

/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
}

/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
/// integers. The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
/// integers. The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
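///
/// # Examples
///
/// A minimal sketch (via `std::arch::x86_64`, assuming AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_setr_epi64x(0, 1, 2, 3);
///     // IMM1 = 1 selects the upper 128 bits, i.e. the elements 2 and 3.
///     let hi = _mm256_extracti128_si256::<1>(a);
/// }
/// ```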
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        let b = i64x4::ZERO;
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_add(even, odd).as_m256i()
    }
}

/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
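///
/// # Examples
///
/// A minimal sketch of the result layout (via `std::arch::x86_64`, assuming
/// AVX2 is available):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///     let b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
///     // Pair sums from `a` and `b` are interleaved per 128-bit lane:
///     // [1, 5, 17, 21, 9, 13, 25, 29].
///     let r = _mm256_hadd_epi32(a, b);
/// }
/// ```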
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_sub(even, odd).as_m256i()
    }
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd).as_m256i()
    }
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
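///
/// # Examples
///
/// A minimal sketch of a gather from a slice (via `std::arch::x86_64`,
/// assuming AVX2 is available); with `SCALE = 4` the offsets are element
/// indices into the `i32` data:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
/// unsafe {
///     let offsets = _mm_setr_epi32(0, 2, 4, 6);
///     // Loads data[0], data[2], data[4] and data[6]: [10, 12, 14, 16].
///     let r = _mm_i32gather_epi32::<4>(data.as_ptr(), offsets);
/// }
/// ```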
1058#[inline]
1059#[target_feature(enable = "avx2")]
1060#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1061#[rustc_legacy_const_generics(2)]
1062#[stable(feature = "simd_x86", since = "1.27.0")]
1063pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
1064    slice: *const i32,
1065    offsets: __m128i,
1066) -> __m128i {
1067    static_assert_imm8_scale!(SCALE);
1068    let zero = i32x4::ZERO;
1069    let neg_one = _mm_set1_epi32(-1).as_i32x4();
1070    let offsets = offsets.as_i32x4();
1071    let slice = slice as *const i8;
1072    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1073    transmute(r)
1074}
1075
1076/// Returns values from `slice` at offsets determined by `offsets * scale`,
1077/// where
1078/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1079/// that position instead.
1080///
1081/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
1082#[inline]
1083#[target_feature(enable = "avx2")]
1084#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1085#[rustc_legacy_const_generics(4)]
1086#[stable(feature = "simd_x86", since = "1.27.0")]
1087pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
1088    src: __m128i,
1089    slice: *const i32,
1090    offsets: __m128i,
1091    mask: __m128i,
1092) -> __m128i {
1093    static_assert_imm8_scale!(SCALE);
1094    let src = src.as_i32x4();
1095    let mask = mask.as_i32x4();
1096    let offsets = offsets.as_i32x4();
1097    let slice = slice as *const i8;
1098    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
1099    transmute(r)
1100}
1101
1102/// Returns values from `slice` at offsets determined by `offsets * scale`,
1103/// where
1104/// `scale` should be 1, 2, 4 or 8.
1105///
1106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
1107#[inline]
1108#[target_feature(enable = "avx2")]
1109#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1110#[rustc_legacy_const_generics(2)]
1111#[stable(feature = "simd_x86", since = "1.27.0")]
1112pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
1113    slice: *const i32,
1114    offsets: __m256i,
1115) -> __m256i {
1116    static_assert_imm8_scale!(SCALE);
1117    let zero = i32x8::ZERO;
1118    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1119    let offsets = offsets.as_i32x8();
1120    let slice = slice as *const i8;
1121    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1122    transmute(r)
1123}
1124
1125/// Returns values from `slice` at offsets determined by `offsets * scale`,
1126/// where
1127/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1128/// that position instead.
1129///
1130/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
1131#[inline]
1132#[target_feature(enable = "avx2")]
1133#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1134#[rustc_legacy_const_generics(4)]
1135#[stable(feature = "simd_x86", since = "1.27.0")]
1136pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
1137    src: __m256i,
1138    slice: *const i32,
1139    offsets: __m256i,
1140    mask: __m256i,
1141) -> __m256i {
1142    static_assert_imm8_scale!(SCALE);
1143    let src = src.as_i32x8();
1144    let mask = mask.as_i32x8();
1145    let offsets = offsets.as_i32x8();
1146    let slice = slice as *const i8;
1147    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
1148    transmute(r)
1149}
1150
1151/// Returns values from `slice` at offsets determined by `offsets * scale`,
1152/// where
1153/// `scale` should be 1, 2, 4 or 8.
1154///
1155/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
1156#[inline]
1157#[target_feature(enable = "avx2")]
1158#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1159#[rustc_legacy_const_generics(2)]
1160#[stable(feature = "simd_x86", since = "1.27.0")]
1161pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1162    static_assert_imm8_scale!(SCALE);
1163    let zero = _mm_setzero_ps();
1164    let neg_one = _mm_set1_ps(-1.0);
1165    let offsets = offsets.as_i32x4();
1166    let slice = slice as *const i8;
1167    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1168}
1169
1170/// Returns values from `slice` at offsets determined by `offsets * scale`,
1171/// where
1172/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1173/// that position instead.
1174///
1175/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
1176#[inline]
1177#[target_feature(enable = "avx2")]
1178#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1179#[rustc_legacy_const_generics(4)]
1180#[stable(feature = "simd_x86", since = "1.27.0")]
1181pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
1182    src: __m128,
1183    slice: *const f32,
1184    offsets: __m128i,
1185    mask: __m128,
1186) -> __m128 {
1187    static_assert_imm8_scale!(SCALE);
1188    let offsets = offsets.as_i32x4();
1189    let slice = slice as *const i8;
1190    pgatherdps(src, slice, offsets, mask, SCALE as i8)
1191}
1192
1193/// Returns values from `slice` at offsets determined by `offsets * scale`,
1194/// where
1195/// `scale` should be 1, 2, 4 or 8.
1196///
1197/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
1198#[inline]
1199#[target_feature(enable = "avx2")]
1200#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1201#[rustc_legacy_const_generics(2)]
1202#[stable(feature = "simd_x86", since = "1.27.0")]
1203pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
1204    static_assert_imm8_scale!(SCALE);
1205    let zero = _mm256_setzero_ps();
1206    let neg_one = _mm256_set1_ps(-1.0);
1207    let offsets = offsets.as_i32x8();
1208    let slice = slice as *const i8;
1209    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1210}
1211
1212/// Returns values from `slice` at offsets determined by `offsets * scale`,
1213/// where
1214/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1215/// that position instead.
1216///
1217/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
1218#[inline]
1219#[target_feature(enable = "avx2")]
1220#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1221#[rustc_legacy_const_generics(4)]
1222#[stable(feature = "simd_x86", since = "1.27.0")]
1223pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
1224    src: __m256,
1225    slice: *const f32,
1226    offsets: __m256i,
1227    mask: __m256,
1228) -> __m256 {
1229    static_assert_imm8_scale!(SCALE);
1230    let offsets = offsets.as_i32x8();
1231    let slice = slice as *const i8;
1232    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
1233}
1234
1235/// Returns values from `slice` at offsets determined by `offsets * scale`,
1236/// where
1237/// `scale` should be 1, 2, 4 or 8.
1238///
1239/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
1240#[inline]
1241#[target_feature(enable = "avx2")]
1242#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1243#[rustc_legacy_const_generics(2)]
1244#[stable(feature = "simd_x86", since = "1.27.0")]
1245pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
1246    slice: *const i64,
1247    offsets: __m128i,
1248) -> __m128i {
1249    static_assert_imm8_scale!(SCALE);
1250    let zero = i64x2::ZERO;
1251    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1252    let offsets = offsets.as_i32x4();
1253    let slice = slice as *const i8;
1254    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1255    transmute(r)
1256}
1257
1258/// Returns values from `slice` at offsets determined by `offsets * scale`,
1259/// where
1260/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1261/// that position instead.
1262///
1263/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
1264#[inline]
1265#[target_feature(enable = "avx2")]
1266#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1267#[rustc_legacy_const_generics(4)]
1268#[stable(feature = "simd_x86", since = "1.27.0")]
1269pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
1270    src: __m128i,
1271    slice: *const i64,
1272    offsets: __m128i,
1273    mask: __m128i,
1274) -> __m128i {
1275    static_assert_imm8_scale!(SCALE);
1276    let src = src.as_i64x2();
1277    let mask = mask.as_i64x2();
1278    let offsets = offsets.as_i32x4();
1279    let slice = slice as *const i8;
1280    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
1281    transmute(r)
1282}
1283
1284/// Returns values from `slice` at offsets determined by `offsets * scale`,
1285/// where
1286/// `scale` should be 1, 2, 4 or 8.
1287///
1288/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
1289#[inline]
1290#[target_feature(enable = "avx2")]
1291#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1292#[rustc_legacy_const_generics(2)]
1293#[stable(feature = "simd_x86", since = "1.27.0")]
1294pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
1295    slice: *const i64,
1296    offsets: __m128i,
1297) -> __m256i {
1298    static_assert_imm8_scale!(SCALE);
1299    let zero = i64x4::ZERO;
1300    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1301    let offsets = offsets.as_i32x4();
1302    let slice = slice as *const i8;
1303    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1304    transmute(r)
1305}
1306
1307/// Returns values from `slice` at offsets determined by `offsets * scale`,
1308/// where
1309/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1310/// that position instead.
1311///
1312/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
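///
/// A small sketch of the masking behavior (added for illustration, not from
/// Intel's documentation), assuming AVX2 has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i64; 8] = [0, 10, 20, 30, 40, 50, 60, 70];
/// let src = _mm256_set1_epi64x(-1);
/// let offsets = _mm_setr_epi32(1, 2, 3, 4);
/// // Only lanes whose mask element has its highest bit set are gathered;
/// // the remaining lanes are copied from `src`.
/// let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
/// let r = _mm256_mask_i32gather_epi64::<8>(src, data.as_ptr(), offsets, mask);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [10, -1, 30, -1]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```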
1313#[inline]
1314#[target_feature(enable = "avx2")]
1315#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1316#[rustc_legacy_const_generics(4)]
1317#[stable(feature = "simd_x86", since = "1.27.0")]
1318pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
1319    src: __m256i,
1320    slice: *const i64,
1321    offsets: __m128i,
1322    mask: __m256i,
1323) -> __m256i {
1324    static_assert_imm8_scale!(SCALE);
1325    let src = src.as_i64x4();
1326    let mask = mask.as_i64x4();
1327    let offsets = offsets.as_i32x4();
1328    let slice = slice as *const i8;
1329    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
1330    transmute(r)
1331}
1332
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1336///
1337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
1338#[inline]
1339#[target_feature(enable = "avx2")]
1340#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1341#[rustc_legacy_const_generics(2)]
1342#[stable(feature = "simd_x86", since = "1.27.0")]
1343pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1344    static_assert_imm8_scale!(SCALE);
1345    let zero = _mm_setzero_pd();
1346    let neg_one = _mm_set1_pd(-1.0);
1347    let offsets = offsets.as_i32x4();
1348    let slice = slice as *const i8;
1349    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1350}
1351
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1356///
1357/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
1358#[inline]
1359#[target_feature(enable = "avx2")]
1360#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1361#[rustc_legacy_const_generics(4)]
1362#[stable(feature = "simd_x86", since = "1.27.0")]
1363pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
1364    src: __m128d,
1365    slice: *const f64,
1366    offsets: __m128i,
1367    mask: __m128d,
1368) -> __m128d {
1369    static_assert_imm8_scale!(SCALE);
1370    let offsets = offsets.as_i32x4();
1371    let slice = slice as *const i8;
1372    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
1373}
1374
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1378///
1379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
1380#[inline]
1381#[target_feature(enable = "avx2")]
1382#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1383#[rustc_legacy_const_generics(2)]
1384#[stable(feature = "simd_x86", since = "1.27.0")]
1385pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
1386    slice: *const f64,
1387    offsets: __m128i,
1388) -> __m256d {
1389    static_assert_imm8_scale!(SCALE);
1390    let zero = _mm256_setzero_pd();
1391    let neg_one = _mm256_set1_pd(-1.0);
1392    let offsets = offsets.as_i32x4();
1393    let slice = slice as *const i8;
1394    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1395}
1396
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1401///
1402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
1403#[inline]
1404#[target_feature(enable = "avx2")]
1405#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1406#[rustc_legacy_const_generics(4)]
1407#[stable(feature = "simd_x86", since = "1.27.0")]
1408pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
1409    src: __m256d,
1410    slice: *const f64,
1411    offsets: __m128i,
1412    mask: __m256d,
1413) -> __m256d {
1414    static_assert_imm8_scale!(SCALE);
1415    let offsets = offsets.as_i32x4();
1416    let slice = slice as *const i8;
1417    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
1418}
1419
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1423///
1424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1425#[inline]
1426#[target_feature(enable = "avx2")]
1427#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1428#[rustc_legacy_const_generics(2)]
1429#[stable(feature = "simd_x86", since = "1.27.0")]
1430pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1431    slice: *const i32,
1432    offsets: __m128i,
1433) -> __m128i {
1434    static_assert_imm8_scale!(SCALE);
1435    let zero = i32x4::ZERO;
1436    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1437    let offsets = offsets.as_i64x2();
1438    let slice = slice as *const i8;
1439    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1440    transmute(r)
1441}
1442
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1447///
1448/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1449#[inline]
1450#[target_feature(enable = "avx2")]
1451#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1452#[rustc_legacy_const_generics(4)]
1453#[stable(feature = "simd_x86", since = "1.27.0")]
1454pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1455    src: __m128i,
1456    slice: *const i32,
1457    offsets: __m128i,
1458    mask: __m128i,
1459) -> __m128i {
1460    static_assert_imm8_scale!(SCALE);
1461    let src = src.as_i32x4();
1462    let mask = mask.as_i32x4();
1463    let offsets = offsets.as_i64x2();
1464    let slice = slice as *const i8;
1465    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1466    transmute(r)
1467}
1468
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1472///
1473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1474#[inline]
1475#[target_feature(enable = "avx2")]
1476#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1477#[rustc_legacy_const_generics(2)]
1478#[stable(feature = "simd_x86", since = "1.27.0")]
1479pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1480    slice: *const i32,
1481    offsets: __m256i,
1482) -> __m128i {
1483    static_assert_imm8_scale!(SCALE);
1484    let zero = i32x4::ZERO;
1485    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1486    let offsets = offsets.as_i64x4();
1487    let slice = slice as *const i8;
1488    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1489    transmute(r)
1490}
1491
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1496///
1497/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1498#[inline]
1499#[target_feature(enable = "avx2")]
1500#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1501#[rustc_legacy_const_generics(4)]
1502#[stable(feature = "simd_x86", since = "1.27.0")]
1503pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1504    src: __m128i,
1505    slice: *const i32,
1506    offsets: __m256i,
1507    mask: __m128i,
1508) -> __m128i {
1509    static_assert_imm8_scale!(SCALE);
1510    let src = src.as_i32x4();
1511    let mask = mask.as_i32x4();
1512    let offsets = offsets.as_i64x4();
1513    let slice = slice as *const i8;
1514    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1515    transmute(r)
1516}
1517
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1521///
1522/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1523#[inline]
1524#[target_feature(enable = "avx2")]
1525#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1526#[rustc_legacy_const_generics(2)]
1527#[stable(feature = "simd_x86", since = "1.27.0")]
1528pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1529    static_assert_imm8_scale!(SCALE);
1530    let zero = _mm_setzero_ps();
1531    let neg_one = _mm_set1_ps(-1.0);
1532    let offsets = offsets.as_i64x2();
1533    let slice = slice as *const i8;
1534    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1535}
1536
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1541///
1542/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1543#[inline]
1544#[target_feature(enable = "avx2")]
1545#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1546#[rustc_legacy_const_generics(4)]
1547#[stable(feature = "simd_x86", since = "1.27.0")]
1548pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1549    src: __m128,
1550    slice: *const f32,
1551    offsets: __m128i,
1552    mask: __m128,
1553) -> __m128 {
1554    static_assert_imm8_scale!(SCALE);
1555    let offsets = offsets.as_i64x2();
1556    let slice = slice as *const i8;
1557    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1558}
1559
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1563///
1564/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
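///
/// A minimal usage sketch (not from Intel's documentation), assuming runtime
/// AVX2 detection; `SCALE` is 4 here because `f32` elements are 4 bytes wide:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [f32; 8] = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
/// // Four 64-bit indices produce a 128-bit result of four `f32`s.
/// let offsets = _mm256_setr_epi64x(6, 4, 2, 0);
/// let r = _mm256_i64gather_ps::<4>(data.as_ptr(), offsets);
/// let mut out = [0.0f32; 4];
/// _mm_storeu_ps(out.as_mut_ptr(), r);
/// assert_eq!(out, [6.0, 4.0, 2.0, 0.0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```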
1565#[inline]
1566#[target_feature(enable = "avx2")]
1567#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1568#[rustc_legacy_const_generics(2)]
1569#[stable(feature = "simd_x86", since = "1.27.0")]
1570pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1571    static_assert_imm8_scale!(SCALE);
1572    let zero = _mm_setzero_ps();
1573    let neg_one = _mm_set1_ps(-1.0);
1574    let offsets = offsets.as_i64x4();
1575    let slice = slice as *const i8;
1576    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1577}
1578
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1583///
1584/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1585#[inline]
1586#[target_feature(enable = "avx2")]
1587#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1588#[rustc_legacy_const_generics(4)]
1589#[stable(feature = "simd_x86", since = "1.27.0")]
1590pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1591    src: __m128,
1592    slice: *const f32,
1593    offsets: __m256i,
1594    mask: __m128,
1595) -> __m128 {
1596    static_assert_imm8_scale!(SCALE);
1597    let offsets = offsets.as_i64x4();
1598    let slice = slice as *const i8;
1599    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1600}
1601
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1605///
1606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1607#[inline]
1608#[target_feature(enable = "avx2")]
1609#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1610#[rustc_legacy_const_generics(2)]
1611#[stable(feature = "simd_x86", since = "1.27.0")]
1612pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1613    slice: *const i64,
1614    offsets: __m128i,
1615) -> __m128i {
1616    static_assert_imm8_scale!(SCALE);
1617    let zero = i64x2::ZERO;
1618    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1619    let slice = slice as *const i8;
1620    let offsets = offsets.as_i64x2();
1621    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1622    transmute(r)
1623}
1624
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1629///
1630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1631#[inline]
1632#[target_feature(enable = "avx2")]
1633#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1634#[rustc_legacy_const_generics(4)]
1635#[stable(feature = "simd_x86", since = "1.27.0")]
1636pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1637    src: __m128i,
1638    slice: *const i64,
1639    offsets: __m128i,
1640    mask: __m128i,
1641) -> __m128i {
1642    static_assert_imm8_scale!(SCALE);
1643    let src = src.as_i64x2();
1644    let mask = mask.as_i64x2();
1645    let offsets = offsets.as_i64x2();
1646    let slice = slice as *const i8;
1647    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1648    transmute(r)
1649}
1650
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1654///
1655/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1656#[inline]
1657#[target_feature(enable = "avx2")]
1658#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1659#[rustc_legacy_const_generics(2)]
1660#[stable(feature = "simd_x86", since = "1.27.0")]
1661pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1662    slice: *const i64,
1663    offsets: __m256i,
1664) -> __m256i {
1665    static_assert_imm8_scale!(SCALE);
1666    let zero = i64x4::ZERO;
1667    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1668    let slice = slice as *const i8;
1669    let offsets = offsets.as_i64x4();
1670    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1671    transmute(r)
1672}
1673
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1678///
1679/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1680#[inline]
1681#[target_feature(enable = "avx2")]
1682#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1683#[rustc_legacy_const_generics(4)]
1684#[stable(feature = "simd_x86", since = "1.27.0")]
1685pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1686    src: __m256i,
1687    slice: *const i64,
1688    offsets: __m256i,
1689    mask: __m256i,
1690) -> __m256i {
1691    static_assert_imm8_scale!(SCALE);
1692    let src = src.as_i64x4();
1693    let mask = mask.as_i64x4();
1694    let offsets = offsets.as_i64x4();
1695    let slice = slice as *const i8;
1696    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1697    transmute(r)
1698}
1699
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1703///
1704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1705#[inline]
1706#[target_feature(enable = "avx2")]
1707#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1708#[rustc_legacy_const_generics(2)]
1709#[stable(feature = "simd_x86", since = "1.27.0")]
1710pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1711    static_assert_imm8_scale!(SCALE);
1712    let zero = _mm_setzero_pd();
1713    let neg_one = _mm_set1_pd(-1.0);
1714    let slice = slice as *const i8;
1715    let offsets = offsets.as_i64x2();
1716    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1717}
1718
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1723///
1724/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1725#[inline]
1726#[target_feature(enable = "avx2")]
1727#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1728#[rustc_legacy_const_generics(4)]
1729#[stable(feature = "simd_x86", since = "1.27.0")]
1730pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1731    src: __m128d,
1732    slice: *const f64,
1733    offsets: __m128i,
1734    mask: __m128d,
1735) -> __m128d {
1736    static_assert_imm8_scale!(SCALE);
1737    let slice = slice as *const i8;
1738    let offsets = offsets.as_i64x2();
1739    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1740}
1741
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8.
1745///
1746/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1747#[inline]
1748#[target_feature(enable = "avx2")]
1749#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1750#[rustc_legacy_const_generics(2)]
1751#[stable(feature = "simd_x86", since = "1.27.0")]
1752pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1753    slice: *const f64,
1754    offsets: __m256i,
1755) -> __m256d {
1756    static_assert_imm8_scale!(SCALE);
1757    let zero = _mm256_setzero_pd();
1758    let neg_one = _mm256_set1_pd(-1.0);
1759    let slice = slice as *const i8;
1760    let offsets = offsets.as_i64x4();
1761    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1762}
1763
/// Returns values loaded from `slice` at the byte offsets given by
/// `offsets * scale`, where `scale` must be 1, 2, 4 or 8. Elements whose
/// corresponding mask element does not have its highest bit set are copied
/// from `src` instead of being loaded from memory.
1768///
1769/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1770#[inline]
1771#[target_feature(enable = "avx2")]
1772#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1773#[rustc_legacy_const_generics(4)]
1774#[stable(feature = "simd_x86", since = "1.27.0")]
1775pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1776    src: __m256d,
1777    slice: *const f64,
1778    offsets: __m256i,
1779    mask: __m256d,
1780) -> __m256d {
1781    static_assert_imm8_scale!(SCALE);
1782    let slice = slice as *const i8;
1783    let offsets = offsets.as_i64x4();
1784    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1785}
1786
/// Copies `a` to the result, then inserts 128 bits (of integer data) from `b`
/// into the 128-bit lane selected by `IMM1`.
1789///
1790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
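///
/// A short illustrative example (not from Intel's documentation), assuming
/// AVX2 has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(9, 8); // 128-bit vector holding [8, 9]
/// // IMM1 = 1 replaces the upper 128-bit half of `a` with `b`.
/// let r = _mm256_inserti128_si256::<1>(a, b);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [0, 1, 8, 9]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```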
1791#[inline]
1792#[target_feature(enable = "avx2")]
1793#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1794#[rustc_legacy_const_generics(2)]
1795#[stable(feature = "simd_x86", since = "1.27.0")]
1796#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1797pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1798    static_assert_uimm_bits!(IMM1, 1);
1799    unsafe {
1800        let a = a.as_i64x4();
1801        let b = _mm256_castsi128_si256(b).as_i64x4();
1802        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1803        transmute(dst)
1804    }
1805}
1806
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers, then horizontally adds adjacent pairs
/// of those intermediate integers and returns the 32-bit sums.
1810///
1811/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
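///
/// A small sketch of the multiply-and-horizontal-add pattern (added for
/// illustration, not from Intel's documentation), assuming AVX2 has been
/// detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(3);
/// let b = _mm256_set1_epi16(4);
/// // Each 32-bit result is 3 * 4 + 3 * 4 = 24.
/// let r = _mm256_madd_epi16(a, b);
/// let expected = _mm256_set1_epi32(24);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```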
1812#[inline]
1813#[target_feature(enable = "avx2")]
1814#[cfg_attr(test, assert_instr(vpmaddwd))]
1815#[stable(feature = "simd_x86", since = "1.27.0")]
1816pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1817    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
1818    //
1819    // ```rust
1820    // #[target_feature(enable = "avx2")]
1821    // unsafe fn widening_add(mad: __m256i) -> __m256i {
1822    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
1823    // }
1824    // ```
1825    //
1826    // If we implement this using generic vector intrinsics, the optimizer
1827    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
1828    // For this reason, we use x86 intrinsics.
1829    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1830}
1831
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers, then horizontally adds adjacent pairs of those
/// intermediate integers and returns the saturated 16-bit sums.
1836///
1837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1838#[inline]
1839#[target_feature(enable = "avx2")]
1840#[cfg_attr(test, assert_instr(vpmaddubsw))]
1841#[stable(feature = "simd_x86", since = "1.27.0")]
1842pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1843    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
1844}
1845
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using
/// `mask` (an element is zeroed out when the highest bit of the corresponding
/// mask element is not set).
1849///
1850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
1851#[inline]
1852#[target_feature(enable = "avx2")]
1853#[cfg_attr(test, assert_instr(vpmaskmovd))]
1854#[stable(feature = "simd_x86", since = "1.27.0")]
1855#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1856pub const unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1857    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1858    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
1859}
1860
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using
/// `mask` (an element is zeroed out when the highest bit of the corresponding
/// mask element is not set).
1864///
1865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
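///
/// A minimal usage sketch (not from Intel's documentation), assuming AVX2 has
/// been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
/// // Elements whose mask element has its highest bit set are loaded;
/// // the rest read as zero.
/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
/// let r = _mm256_maskload_epi32(data.as_ptr(), mask);
/// let mut out = [0i32; 8];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [1, 0, 3, 0, 5, 0, 7, 0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```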
1866#[inline]
1867#[target_feature(enable = "avx2")]
1868#[cfg_attr(test, assert_instr(vpmaskmovd))]
1869#[stable(feature = "simd_x86", since = "1.27.0")]
1870#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1871pub const unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1872    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1873    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
1874}
1875
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using
/// `mask` (an element is zeroed out when the highest bit of the corresponding
/// mask element is not set).
1879///
1880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1881#[inline]
1882#[target_feature(enable = "avx2")]
1883#[cfg_attr(test, assert_instr(vpmaskmovq))]
1884#[stable(feature = "simd_x86", since = "1.27.0")]
1885#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1886pub const unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1887    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1888    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
1889}
1890
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using
/// `mask` (an element is zeroed out when the highest bit of the corresponding
/// mask element is not set).
1894///
1895/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1896#[inline]
1897#[target_feature(enable = "avx2")]
1898#[cfg_attr(test, assert_instr(vpmaskmovq))]
1899#[stable(feature = "simd_x86", since = "1.27.0")]
1900#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1901pub const unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1902    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1903    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
1904}
1905
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
/// using `mask` (an element is not stored when the highest bit of the
/// corresponding mask element is not set).
1909///
1910/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
1911#[inline]
1912#[target_feature(enable = "avx2")]
1913#[cfg_attr(test, assert_instr(vpmaskmovd))]
1914#[stable(feature = "simd_x86", since = "1.27.0")]
1915#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1916pub const unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1917    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1918    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
1919}
1920
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
/// using `mask` (an element is not stored when the highest bit of the
/// corresponding mask element is not set).
1924///
1925/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
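///
/// A small illustrative example (not from Intel's documentation), assuming
/// AVX2 has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
/// let mut out = [0i32; 8];
/// // Only lanes whose mask element has its highest bit set are written.
/// let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, 0, 0, -1);
/// _mm256_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 0, 4, 0, 0, 0, 8]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```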
1926#[inline]
1927#[target_feature(enable = "avx2")]
1928#[cfg_attr(test, assert_instr(vpmaskmovd))]
1929#[stable(feature = "simd_x86", since = "1.27.0")]
1930#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1931pub const unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1932    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1933    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
1934}
1935
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
/// using `mask` (an element is not stored when the highest bit of the
/// corresponding mask element is not set).
1939///
1940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1941#[inline]
1942#[target_feature(enable = "avx2")]
1943#[cfg_attr(test, assert_instr(vpmaskmovq))]
1944#[stable(feature = "simd_x86", since = "1.27.0")]
1945#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1946pub const unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1947    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1948    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
1949}
1950
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
/// using `mask` (an element is not stored when the highest bit of the
/// corresponding mask element is not set).
1954///
1955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1956#[inline]
1957#[target_feature(enable = "avx2")]
1958#[cfg_attr(test, assert_instr(vpmaskmovq))]
1959#[stable(feature = "simd_x86", since = "1.27.0")]
1960#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1961pub const unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1962    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1963    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
1964}
1965
1966/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1967/// maximum values.
1968///
1969/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1970#[inline]
1971#[target_feature(enable = "avx2")]
1972#[cfg_attr(test, assert_instr(vpmaxsw))]
1973#[stable(feature = "simd_x86", since = "1.27.0")]
1974#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1975pub const fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1976    unsafe { simd_imax(a.as_i16x16(), b.as_i16x16()).as_m256i() }
1977}
1978
1979/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1980/// maximum values.
1981///
1982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1983#[inline]
1984#[target_feature(enable = "avx2")]
1985#[cfg_attr(test, assert_instr(vpmaxsd))]
1986#[stable(feature = "simd_x86", since = "1.27.0")]
1987#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1988pub const fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1989    unsafe { simd_imax(a.as_i32x8(), b.as_i32x8()).as_m256i() }
1990}
1991
1992/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1993/// maximum values.
1994///
1995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1996#[inline]
1997#[target_feature(enable = "avx2")]
1998#[cfg_attr(test, assert_instr(vpmaxsb))]
1999#[stable(feature = "simd_x86", since = "1.27.0")]
2000#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2001pub const fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2002    unsafe { simd_imax(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2003}
2004
2005/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2006/// the packed maximum values.
2007///
2008/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
2009#[inline]
2010#[target_feature(enable = "avx2")]
2011#[cfg_attr(test, assert_instr(vpmaxuw))]
2012#[stable(feature = "simd_x86", since = "1.27.0")]
2013#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2014pub const fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2015    unsafe { simd_imax(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2016}
2017
2018/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2019/// the packed maximum values.
2020///
2021/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
2022#[inline]
2023#[target_feature(enable = "avx2")]
2024#[cfg_attr(test, assert_instr(vpmaxud))]
2025#[stable(feature = "simd_x86", since = "1.27.0")]
2026#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2027pub const fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2028    unsafe { simd_imax(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2029}
2030
2031/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2032/// the packed maximum values.
2033///
2034/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2035#[inline]
2036#[target_feature(enable = "avx2")]
2037#[cfg_attr(test, assert_instr(vpmaxub))]
2038#[stable(feature = "simd_x86", since = "1.27.0")]
2039#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2040pub const fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2041    unsafe { simd_imax(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2042}
2043
2044/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2045/// minimum values.
2046///
2047/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2048#[inline]
2049#[target_feature(enable = "avx2")]
2050#[cfg_attr(test, assert_instr(vpminsw))]
2051#[stable(feature = "simd_x86", since = "1.27.0")]
2052#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2053pub const fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2054    unsafe { simd_imin(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2055}
2056
2057/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2058/// minimum values.
2059///
2060/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2061#[inline]
2062#[target_feature(enable = "avx2")]
2063#[cfg_attr(test, assert_instr(vpminsd))]
2064#[stable(feature = "simd_x86", since = "1.27.0")]
2065#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2066pub const fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2067    unsafe { simd_imin(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2068}
2069
2070/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2071/// minimum values.
2072///
2073/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2074#[inline]
2075#[target_feature(enable = "avx2")]
2076#[cfg_attr(test, assert_instr(vpminsb))]
2077#[stable(feature = "simd_x86", since = "1.27.0")]
2078#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2079pub const fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2080    unsafe { simd_imin(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2081}
2082
2083/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2084/// the packed minimum values.
2085///
2086/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2087#[inline]
2088#[target_feature(enable = "avx2")]
2089#[cfg_attr(test, assert_instr(vpminuw))]
2090#[stable(feature = "simd_x86", since = "1.27.0")]
2091#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2092pub const fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2093    unsafe { simd_imin(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2094}
2095
2096/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2097/// the packed minimum values.
2098///
2099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2100#[inline]
2101#[target_feature(enable = "avx2")]
2102#[cfg_attr(test, assert_instr(vpminud))]
2103#[stable(feature = "simd_x86", since = "1.27.0")]
2104#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2105pub const fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2106    unsafe { simd_imin(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2107}
2108
2109/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2110/// the packed minimum values.
2111///
2112/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2113#[inline]
2114#[target_feature(enable = "avx2")]
2115#[cfg_attr(test, assert_instr(vpminub))]
2116#[stable(feature = "simd_x86", since = "1.27.0")]
2117#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2118pub const fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2119    unsafe { simd_imin(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2120}
2121
/// Creates a mask from the most significant bit of each 8-bit element in `a`
/// and returns the result.
2124///
2125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
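///
/// A minimal illustrative example (not from Intel's documentation), assuming
/// AVX2 has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0,
///     -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0,
/// );
/// // Bit i of the result is the most significant bit of byte i of `a`.
/// assert_eq!(_mm256_movemask_epi8(a), 0x1111_1111);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```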
2126#[inline]
2127#[target_feature(enable = "avx2")]
2128#[cfg_attr(test, assert_instr(vpmovmskb))]
2129#[stable(feature = "simd_x86", since = "1.27.0")]
2130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2131pub const fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2132    unsafe {
2133        let z = i8x32::ZERO;
2134        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2135        simd_bitmask::<_, u32>(m) as i32
2136    }
2137}
2138
/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and returns the 16-bit
/// results. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
2146///
2147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2148#[inline]
2149#[target_feature(enable = "avx2")]
2150#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2151#[rustc_legacy_const_generics(2)]
2152#[stable(feature = "simd_x86", since = "1.27.0")]
2153pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2154    static_assert_uimm_bits!(IMM8, 8);
2155    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
2156}
2157
2158/// Multiplies the low 32-bit integers from each packed 64-bit element in
2159/// `a` and `b`
2160///
2161/// Returns the 64-bit results.
2162///
2163/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
2164#[inline]
2165#[target_feature(enable = "avx2")]
2166#[cfg_attr(test, assert_instr(vpmuldq))]
2167#[stable(feature = "simd_x86", since = "1.27.0")]
2168#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2169pub const fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2170    unsafe {
2171        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2172        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2173        transmute(simd_mul(a, b))
2174    }
2175}
2176
2177/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2178/// element in `a` and `b`
2179///
2180/// Returns the unsigned 64-bit results.
2181///
2182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2183#[inline]
2184#[target_feature(enable = "avx2")]
2185#[cfg_attr(test, assert_instr(vpmuludq))]
2186#[stable(feature = "simd_x86", since = "1.27.0")]
2187#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2188pub const fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2189    unsafe {
2190        let a = a.as_u64x4();
2191        let b = b.as_u64x4();
2192        let mask = u64x4::splat(u32::MAX as u64);
2193        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2194    }
2195}
2196
2197/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2198/// intermediate 32-bit integers and returning the high 16 bits of the
2199/// intermediate integers.
2200///
2201/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
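///
/// A small sketch (added for illustration, not from Intel's documentation),
/// assuming AVX2 has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(1000);
/// let b = _mm256_set1_epi16(1000);
/// // 1000 * 1000 = 1_000_000 = 0x000F_4240, so the high 16 bits are 0x000F.
/// let r = _mm256_mulhi_epi16(a, b);
/// let expected = _mm256_set1_epi16(0x000F);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```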
2202#[inline]
2203#[target_feature(enable = "avx2")]
2204#[cfg_attr(test, assert_instr(vpmulhw))]
2205#[stable(feature = "simd_x86", since = "1.27.0")]
2206#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2207pub const fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2208    unsafe {
2209        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2210        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2211        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2212        transmute(simd_cast::<i32x16, i16x16>(r))
2213    }
2214}
2215
2216/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2217/// intermediate 32-bit integers and returning the high 16 bits of the
2218/// intermediate integers.
2219///
2220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2221#[inline]
2222#[target_feature(enable = "avx2")]
2223#[cfg_attr(test, assert_instr(vpmulhuw))]
2224#[stable(feature = "simd_x86", since = "1.27.0")]
2225#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2226pub const fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2227    unsafe {
2228        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2229        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2230        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2231        transmute(simd_cast::<u32x16, u16x16>(r))
2232    }
2233}
2234
2235/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2236/// intermediate 32-bit integers, and returns the low 16 bits of the
2237/// intermediate integers
2238///
2239/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2240#[inline]
2241#[target_feature(enable = "avx2")]
2242#[cfg_attr(test, assert_instr(vpmullw))]
2243#[stable(feature = "simd_x86", since = "1.27.0")]
2244#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2245pub const fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2246    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2247}
2248
2249/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2250/// intermediate 64-bit integers, and returns the low 32 bits of the
2251/// intermediate integers
2252///
2253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2254#[inline]
2255#[target_feature(enable = "avx2")]
2256#[cfg_attr(test, assert_instr(vpmulld))]
2257#[stable(feature = "simd_x86", since = "1.27.0")]
2258#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2259pub const fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2260    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2261}
2262
/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncates each intermediate
/// integer to the 18 most significant bits, rounds by adding 1, and
/// returns bits `[16:1]`.
2267///
2268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
2269#[inline]
2270#[target_feature(enable = "avx2")]
2271#[cfg_attr(test, assert_instr(vpmulhrsw))]
2272#[stable(feature = "simd_x86", since = "1.27.0")]
2273pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2274    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2275}
2276
2277/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2278/// and `b`
2279///
2280/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2281#[inline]
2282#[target_feature(enable = "avx2")]
2283#[cfg_attr(test, assert_instr(vorps))]
2284#[stable(feature = "simd_x86", since = "1.27.0")]
2285#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2286pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2287    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2288}
2289
2290/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2291/// using signed saturation
2292///
2293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
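///
/// A short sketch of the saturating pack and its per-lane ordering (added for
/// illustration, not from Intel's documentation), assuming AVX2 has been
/// detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(300); // saturates to 127 as an i8
/// let b = _mm256_set1_epi16(-300); // saturates to -128 as an i8
/// let r = _mm256_packs_epi16(a, b);
/// let mut out = [0i8; 32];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// // Within each 128-bit lane: 8 bytes from `a`, then 8 bytes from `b`.
/// assert_eq!(out[0], 127);
/// assert_eq!(out[8], -128);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```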
2294#[inline]
2295#[target_feature(enable = "avx2")]
2296#[cfg_attr(test, assert_instr(vpacksswb))]
2297#[stable(feature = "simd_x86", since = "1.27.0")]
2298pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2299    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2300}
2301
2302/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2303/// using signed saturation
2304///
2305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2306#[inline]
2307#[target_feature(enable = "avx2")]
2308#[cfg_attr(test, assert_instr(vpackssdw))]
2309#[stable(feature = "simd_x86", since = "1.27.0")]
2310pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2311    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2312}
2313
2314/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2315/// using unsigned saturation
2316///
2317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2318#[inline]
2319#[target_feature(enable = "avx2")]
2320#[cfg_attr(test, assert_instr(vpackuswb))]
2321#[stable(feature = "simd_x86", since = "1.27.0")]
2322pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2323    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2324}
2325
2326/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2327/// using unsigned saturation
2328///
2329/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2330#[inline]
2331#[target_feature(enable = "avx2")]
2332#[cfg_attr(test, assert_instr(vpackusdw))]
2333#[stable(feature = "simd_x86", since = "1.27.0")]
2334pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2335    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2336}
2337
2338/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2339///
/// The lowest 3 bits of each integer of `b` are used as indices into the 8
/// integers of `a`.
2342///
2343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
2344#[inline]
2345#[target_feature(enable = "avx2")]
2346#[cfg_attr(test, assert_instr(vpermps))]
2347#[stable(feature = "simd_x86", since = "1.27.0")]
2348pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2349    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2350}
2351
2352/// Permutes 64-bit integers from `a` using control mask `imm8`.
2353///
2354/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
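///
/// A short sketch showing how the two-bit index fields select elements (added
/// for illustration, not from Intel's documentation), assuming AVX2 has been
/// detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// // The two-bit fields of IMM8, read from least to most significant,
/// // select source elements 3, 2, 1, 0, reversing the vector.
/// let r = _mm256_permute4x64_epi64::<0b00_01_10_11>(a);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [3, 2, 1, 0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```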
2355#[inline]
2356#[target_feature(enable = "avx2")]
2357#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2358#[rustc_legacy_const_generics(1)]
2359#[stable(feature = "simd_x86", since = "1.27.0")]
2360#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2361pub const fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2362    static_assert_uimm_bits!(IMM8, 8);
2363    unsafe {
2364        let zero = i64x4::ZERO;
2365        let r: i64x4 = simd_shuffle!(
2366            a.as_i64x4(),
2367            zero,
2368            [
2369                IMM8 as u32 & 0b11,
2370                (IMM8 as u32 >> 2) & 0b11,
2371                (IMM8 as u32 >> 4) & 0b11,
2372                (IMM8 as u32 >> 6) & 0b11,
2373            ],
2374        );
2375        transmute(r)
2376    }
2377}
2378
2379/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
2380///
2381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2382#[inline]
2383#[target_feature(enable = "avx2")]
2384#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2385#[rustc_legacy_const_generics(2)]
2386#[stable(feature = "simd_x86", since = "1.27.0")]
2387#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2388pub const fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2389    static_assert_uimm_bits!(IMM8, 8);
2390    _mm256_permute2f128_si256::<IMM8>(a, b)
2391}
2392
2393/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2394/// control in `imm8`.
2395///
2396/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2397#[inline]
2398#[target_feature(enable = "avx2")]
2399#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2400#[rustc_legacy_const_generics(1)]
2401#[stable(feature = "simd_x86", since = "1.27.0")]
2402#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2403pub const fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2404    static_assert_uimm_bits!(IMM8, 8);
2405    unsafe {
2406        simd_shuffle!(
2407            a,
2408            _mm256_undefined_pd(),
2409            [
2410                IMM8 as u32 & 0b11,
2411                (IMM8 as u32 >> 2) & 0b11,
2412                (IMM8 as u32 >> 4) & 0b11,
2413                (IMM8 as u32 >> 6) & 0b11,
2414            ],
2415        )
2416    }
2417}
2418
2419/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2420/// the corresponding 32-bit integer index in `idx`.
2421///
2422/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2423#[inline]
2424#[target_feature(enable = "avx2")]
2425#[cfg_attr(test, assert_instr(vpermps))]
2426#[stable(feature = "simd_x86", since = "1.27.0")]
2427pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2428    unsafe { permps(a, idx.as_i32x8()) }
2429}
2430
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive 8 differences to produce
/// four unsigned 16-bit integers, and packs each of these 16-bit sums into the
/// low 16 bits of the corresponding 64-bit element of the result.
2435///
2436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
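///
/// A minimal illustrative example (not from Intel's documentation), assuming
/// AVX2 has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(5);
/// let b = _mm256_set1_epi8(2);
/// // Each group of eight |5 - 2| differences sums to 24, stored in the
/// // low 16 bits of the corresponding 64-bit element.
/// let r = _mm256_sad_epu8(a, b);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [24, 24, 24, 24]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```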
2437#[inline]
2438#[target_feature(enable = "avx2")]
2439#[cfg_attr(test, assert_instr(vpsadbw))]
2440#[stable(feature = "simd_x86", since = "1.27.0")]
2441pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2442    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2443}
2444
2445/// Shuffles bytes from `a` according to the content of `b`.
2446///
/// For each of the 128-bit low and high halves of the vectors, the lowest
/// 4 bits of each byte of `b` are used as indices into the respective
/// low or high 16 bytes of `a`. That is, the two halves are shuffled
/// separately.
///
/// In addition, if the most significant bit of a byte of `b` is set, the
/// corresponding destination byte is set to 0.
2453///
2454/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2455/// equivalent to:
2456///
2457/// ```
2458/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2459///     let mut r = [0; 32];
2460///     for i in 0..16 {
2461///         // if the most significant bit of b is set,
2462///         // then the destination byte is set to 0.
2463///         if b[i] & 0x80 == 0u8 {
2464///             r[i] = a[(b[i] % 16) as usize];
2465///         }
2466///         if b[i + 16] & 0x80 == 0u8 {
2467///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2468///         }
2469///     }
2470///     r
2471/// }
2472/// ```
2473///
2474/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2475#[inline]
2476#[target_feature(enable = "avx2")]
2477#[cfg_attr(test, assert_instr(vpshufb))]
2478#[stable(feature = "simd_x86", since = "1.27.0")]
2479pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2480    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2481}
2482
2483/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2484/// `imm8`.
2485///
2486/// ```rust
2487/// #[cfg(target_arch = "x86")]
2488/// use std::arch::x86::*;
2489/// #[cfg(target_arch = "x86_64")]
2490/// use std::arch::x86_64::*;
2491///
2492/// # fn main() {
2493/// #     if is_x86_feature_detected!("avx2") {
2494/// #         #[target_feature(enable = "avx2")]
2495/// #         unsafe fn worker() {
2496/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2497///
2498/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2499/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2500///
2501/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2502/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2503///
2504/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2505/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2506/// #         }
2507/// #         unsafe { worker(); }
2508/// #     }
2509/// # }
2510/// ```
2511///
2512/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2513#[inline]
2514#[target_feature(enable = "avx2")]
2515#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2516#[rustc_legacy_const_generics(1)]
2517#[stable(feature = "simd_x86", since = "1.27.0")]
2518#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2519pub const fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2520    static_assert_uimm_bits!(MASK, 8);
2521    unsafe {
2522        let r: i32x8 = simd_shuffle!(
2523            a.as_i32x8(),
2524            a.as_i32x8(),
2525            [
2526                MASK as u32 & 0b11,
2527                (MASK as u32 >> 2) & 0b11,
2528                (MASK as u32 >> 4) & 0b11,
2529                (MASK as u32 >> 6) & 0b11,
2530                (MASK as u32 & 0b11) + 4,
2531                ((MASK as u32 >> 2) & 0b11) + 4,
2532                ((MASK as u32 >> 4) & 0b11) + 4,
2533                ((MASK as u32 >> 6) & 0b11) + 4,
2534            ],
2535        );
2536        transmute(r)
2537    }
2538}
2539
2540/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2541/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2542/// to the output.
2543///
2544/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2545#[inline]
2546#[target_feature(enable = "avx2")]
2547#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2548#[rustc_legacy_const_generics(1)]
2549#[stable(feature = "simd_x86", since = "1.27.0")]
2550#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2551pub const fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2552    static_assert_uimm_bits!(IMM8, 8);
2553    unsafe {
2554        let a = a.as_i16x16();
2555        let r: i16x16 = simd_shuffle!(
2556            a,
2557            a,
2558            [
2559                0,
2560                1,
2561                2,
2562                3,
2563                4 + (IMM8 as u32 & 0b11),
2564                4 + ((IMM8 as u32 >> 2) & 0b11),
2565                4 + ((IMM8 as u32 >> 4) & 0b11),
2566                4 + ((IMM8 as u32 >> 6) & 0b11),
2567                8,
2568                9,
2569                10,
2570                11,
2571                12 + (IMM8 as u32 & 0b11),
2572                12 + ((IMM8 as u32 >> 2) & 0b11),
2573                12 + ((IMM8 as u32 >> 4) & 0b11),
2574                12 + ((IMM8 as u32 >> 6) & 0b11),
2575            ],
2576        );
2577        transmute(r)
2578    }
2579}
2580
2581/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2582/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2583/// to the output.
2584///
2585/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2586#[inline]
2587#[target_feature(enable = "avx2")]
2588#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2589#[rustc_legacy_const_generics(1)]
2590#[stable(feature = "simd_x86", since = "1.27.0")]
2591#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2592pub const fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2593    static_assert_uimm_bits!(IMM8, 8);
2594    unsafe {
2595        let a = a.as_i16x16();
2596        let r: i16x16 = simd_shuffle!(
2597            a,
2598            a,
2599            [
2600                0 + (IMM8 as u32 & 0b11),
2601                0 + ((IMM8 as u32 >> 2) & 0b11),
2602                0 + ((IMM8 as u32 >> 4) & 0b11),
2603                0 + ((IMM8 as u32 >> 6) & 0b11),
2604                4,
2605                5,
2606                6,
2607                7,
2608                8 + (IMM8 as u32 & 0b11),
2609                8 + ((IMM8 as u32 >> 2) & 0b11),
2610                8 + ((IMM8 as u32 >> 4) & 0b11),
2611                8 + ((IMM8 as u32 >> 6) & 0b11),
2612                12,
2613                13,
2614                14,
2615                15,
2616            ],
2617        );
2618        transmute(r)
2619    }
2620}
2621
2622/// Negates packed 16-bit integers in `a` when the corresponding signed
2623/// 16-bit integer in `b` is negative, and returns the results.
2624/// Results are zeroed out when the corresponding element in `b` is zero.
2625///
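/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing how negative, zero, and positive elements of `b` behave:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi16(
///     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
/// );
/// let b = _mm256_setr_epi16(
///     -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1,
/// );
///
/// let c = _mm256_sign_epi16(a, b);
///
/// // Negative elements of `b` negate, zeros clear, positives pass through.
/// let expected = _mm256_setr_epi16(
///     -1, 0, 3, -4, 0, 6, -7, 0, 9, -10, 0, 12, -13, 0, 15, -16,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///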
2626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
2627#[inline]
2628#[target_feature(enable = "avx2")]
2629#[cfg_attr(test, assert_instr(vpsignw))]
2630#[stable(feature = "simd_x86", since = "1.27.0")]
2631pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2632    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2633}
2634
2635/// Negates packed 32-bit integers in `a` when the corresponding signed
2636/// 32-bit integer in `b` is negative, and returns the results.
2637/// Results are zeroed out when the corresponding element in `b` is zero.
2638///
2639/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2640#[inline]
2641#[target_feature(enable = "avx2")]
2642#[cfg_attr(test, assert_instr(vpsignd))]
2643#[stable(feature = "simd_x86", since = "1.27.0")]
2644pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2645    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2646}
2647
2648/// Negates packed 8-bit integers in `a` when the corresponding signed
2649/// 8-bit integer in `b` is negative, and returns the results.
2650/// Results are zeroed out when the corresponding element in `b` is zero.
2651///
2652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2653#[inline]
2654#[target_feature(enable = "avx2")]
2655#[cfg_attr(test, assert_instr(vpsignb))]
2656#[stable(feature = "simd_x86", since = "1.27.0")]
2657pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2658    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2659}
2660
2661/// Shifts packed 16-bit integers in `a` left by `count` while
2662/// shifting in zeros, and returns the result.
2663///
2664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2665#[inline]
2666#[target_feature(enable = "avx2")]
2667#[cfg_attr(test, assert_instr(vpsllw))]
2668#[stable(feature = "simd_x86", since = "1.27.0")]
2669pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2670    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2671}
2672
2673/// Shifts packed 32-bit integers in `a` left by `count` while
2674/// shifting in zeros, and returns the result.
2675///
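/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime). The shift amount is taken from the low 64 bits of `count`:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// // Only the low 64 bits of `count` are used as the shift amount.
/// let count = _mm_setr_epi32(4, 0, 0, 0);
///
/// let r = _mm256_sll_epi32(a, count);
///
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi32(16))), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///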
2676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2677#[inline]
2678#[target_feature(enable = "avx2")]
2679#[cfg_attr(test, assert_instr(vpslld))]
2680#[stable(feature = "simd_x86", since = "1.27.0")]
2681pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2682    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2683}
2684
2685/// Shifts packed 64-bit integers in `a` left by `count` while
2686/// shifting in zeros, and returns the result.
2687///
2688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2689#[inline]
2690#[target_feature(enable = "avx2")]
2691#[cfg_attr(test, assert_instr(vpsllq))]
2692#[stable(feature = "simd_x86", since = "1.27.0")]
2693pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2694    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2695}
2696
2697/// Shifts packed 16-bit integers in `a` left by `IMM8` while
2698/// shifting in zeros, and returns the results.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2701#[inline]
2702#[target_feature(enable = "avx2")]
2703#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2704#[rustc_legacy_const_generics(1)]
2705#[stable(feature = "simd_x86", since = "1.27.0")]
2706#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2707pub const fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2708    static_assert_uimm_bits!(IMM8, 8);
2709    unsafe {
2710        if IMM8 >= 16 {
2711            _mm256_setzero_si256()
2712        } else {
2713            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2714        }
2715    }
2716}
2717
2718/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2719/// shifting in zeros, and returns the results.
2720///
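/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at runtime):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
///
/// let r = _mm256_slli_epi32::<4>(a);
///
/// // Each element is multiplied by 2^4 = 16; a count of 32 or more would zero the vector.
/// let expected = _mm256_setr_epi32(16, 32, 48, 64, 80, 96, 112, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///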
2721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2722#[inline]
2723#[target_feature(enable = "avx2")]
2724#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2725#[rustc_legacy_const_generics(1)]
2726#[stable(feature = "simd_x86", since = "1.27.0")]
2727#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2728pub const fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2729    static_assert_uimm_bits!(IMM8, 8);
2730    unsafe {
2731        if IMM8 >= 32 {
2732            _mm256_setzero_si256()
2733        } else {
2734            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2735        }
2736    }
2737}
2738
2739/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2740/// shifting in zeros, and returns the results.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2743#[inline]
2744#[target_feature(enable = "avx2")]
2745#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2746#[rustc_legacy_const_generics(1)]
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2749pub const fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2750    static_assert_uimm_bits!(IMM8, 8);
2751    unsafe {
2752        if IMM8 >= 64 {
2753            _mm256_setzero_si256()
2754        } else {
2755            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2756        }
2757    }
2758}
2759
2760/// Shifts 128-bit lanes in `a` left by `IMM8` bytes while shifting in zeros.
2761///
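/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime). Note that each 128-bit lane is shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
///     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
/// );
///
/// // Each 128-bit lane is shifted left by one byte, shifting in a zero byte.
/// let r = _mm256_slli_si256::<1>(a);
///
/// let expected = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///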
2762/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2763#[inline]
2764#[target_feature(enable = "avx2")]
2765#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2766#[rustc_legacy_const_generics(1)]
2767#[stable(feature = "simd_x86", since = "1.27.0")]
2768#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2769pub const fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2770    static_assert_uimm_bits!(IMM8, 8);
2771    _mm256_bslli_epi128::<IMM8>(a)
2772}
2773
2774/// Shifts 128-bit lanes in `a` left by `IMM8` bytes while shifting in zeros.
2775///
2776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2777#[inline]
2778#[target_feature(enable = "avx2")]
2779#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2780#[rustc_legacy_const_generics(1)]
2781#[stable(feature = "simd_x86", since = "1.27.0")]
2782#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2783pub const fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2784    static_assert_uimm_bits!(IMM8, 8);
2785    const fn mask(shift: i32, i: u32) -> u32 {
2786        let shift = shift as u32 & 0xff;
2787        if shift > 15 || i % 16 < shift {
2788            0
2789        } else {
2790            32 + (i - shift)
2791        }
2792    }
2793    unsafe {
2794        let a = a.as_i8x32();
2795        let r: i8x32 = simd_shuffle!(
2796            i8x32::ZERO,
2797            a,
2798            [
2799                mask(IMM8, 0),
2800                mask(IMM8, 1),
2801                mask(IMM8, 2),
2802                mask(IMM8, 3),
2803                mask(IMM8, 4),
2804                mask(IMM8, 5),
2805                mask(IMM8, 6),
2806                mask(IMM8, 7),
2807                mask(IMM8, 8),
2808                mask(IMM8, 9),
2809                mask(IMM8, 10),
2810                mask(IMM8, 11),
2811                mask(IMM8, 12),
2812                mask(IMM8, 13),
2813                mask(IMM8, 14),
2814                mask(IMM8, 15),
2815                mask(IMM8, 16),
2816                mask(IMM8, 17),
2817                mask(IMM8, 18),
2818                mask(IMM8, 19),
2819                mask(IMM8, 20),
2820                mask(IMM8, 21),
2821                mask(IMM8, 22),
2822                mask(IMM8, 23),
2823                mask(IMM8, 24),
2824                mask(IMM8, 25),
2825                mask(IMM8, 26),
2826                mask(IMM8, 27),
2827                mask(IMM8, 28),
2828                mask(IMM8, 29),
2829                mask(IMM8, 30),
2830                mask(IMM8, 31),
2831            ],
2832        );
2833        transmute(r)
2834    }
2835}
2836
2837/// Shifts packed 32-bit integers in `a` left by the amount
2838/// specified by the corresponding element in `count` while
2839/// shifting in zeros, and returns the result.
2840///
2841/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2842#[inline]
2843#[target_feature(enable = "avx2")]
2844#[cfg_attr(test, assert_instr(vpsllvd))]
2845#[stable(feature = "simd_x86", since = "1.27.0")]
2846#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2847pub const fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2848    unsafe {
2849        let count = count.as_u32x4();
2850        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2851        let count = simd_select(no_overflow, count, u32x4::ZERO);
2852        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
2853    }
2854}
2855
2856/// Shifts packed 32-bit integers in `a` left by the amount
2857/// specified by the corresponding element in `count` while
2858/// shifting in zeros, and returns the result.
2859///
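/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing the per-element shift counts, including an out-of-range one:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 31, 32);
///
/// let r = _mm256_sllv_epi32(a, count);
///
/// // A per-element count of 32 or more produces zero.
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, i32::MIN, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///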
2860/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2861#[inline]
2862#[target_feature(enable = "avx2")]
2863#[cfg_attr(test, assert_instr(vpsllvd))]
2864#[stable(feature = "simd_x86", since = "1.27.0")]
2865#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2866pub const fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2867    unsafe {
2868        let count = count.as_u32x8();
2869        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2870        let count = simd_select(no_overflow, count, u32x8::ZERO);
2871        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
2872    }
2873}
2874
2875/// Shifts packed 64-bit integers in `a` left by the amount
2876/// specified by the corresponding element in `count` while
2877/// shifting in zeros, and returns the result.
2878///
2879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2880#[inline]
2881#[target_feature(enable = "avx2")]
2882#[cfg_attr(test, assert_instr(vpsllvq))]
2883#[stable(feature = "simd_x86", since = "1.27.0")]
2884#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2885pub const fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2886    unsafe {
2887        let count = count.as_u64x2();
2888        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
2889        let count = simd_select(no_overflow, count, u64x2::ZERO);
2890        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
2891    }
2892}
2893
2894/// Shifts packed 64-bit integers in `a` left by the amount
2895/// specified by the corresponding element in `count` while
2896/// shifting in zeros, and returns the result.
2897///
2898/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2899#[inline]
2900#[target_feature(enable = "avx2")]
2901#[cfg_attr(test, assert_instr(vpsllvq))]
2902#[stable(feature = "simd_x86", since = "1.27.0")]
2903#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2904pub const fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2905    unsafe {
2906        let count = count.as_u64x4();
2907        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
2908        let count = simd_select(no_overflow, count, u64x4::ZERO);
2909        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
2910    }
2911}
2912
2913/// Shifts packed 16-bit integers in `a` right by `count` while
2914/// shifting in sign bits.
2915///
2916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2917#[inline]
2918#[target_feature(enable = "avx2")]
2919#[cfg_attr(test, assert_instr(vpsraw))]
2920#[stable(feature = "simd_x86", since = "1.27.0")]
2921pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2922    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2923}
2924
2925/// Shifts packed 32-bit integers in `a` right by `count` while
2926/// shifting in sign bits.
2927///
2928/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2929#[inline]
2930#[target_feature(enable = "avx2")]
2931#[cfg_attr(test, assert_instr(vpsrad))]
2932#[stable(feature = "simd_x86", since = "1.27.0")]
2933pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2934    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2935}
2936
2937/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2938/// shifting in sign bits.
2939///
2940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2941#[inline]
2942#[target_feature(enable = "avx2")]
2943#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2944#[rustc_legacy_const_generics(1)]
2945#[stable(feature = "simd_x86", since = "1.27.0")]
2946#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2947pub const fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2948    static_assert_uimm_bits!(IMM8, 8);
2949    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2950}
2951
2952/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2953/// shifting in sign bits.
2954///
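/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing that the shift is arithmetic:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-8, -4, -2, -1, 1, 2, 4, 8);
///
/// let r = _mm256_srai_epi32::<1>(a);
///
/// // The sign bit is replicated, so -1 stays -1.
/// let expected = _mm256_setr_epi32(-4, -2, -1, -1, 0, 1, 2, 4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///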
2955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2956#[inline]
2957#[target_feature(enable = "avx2")]
2958#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2959#[rustc_legacy_const_generics(1)]
2960#[stable(feature = "simd_x86", since = "1.27.0")]
2961#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2962pub const fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2963    static_assert_uimm_bits!(IMM8, 8);
2964    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2965}
2966
2967/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2968/// corresponding element in `count` while shifting in sign bits.
2969///
2970/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2971#[inline]
2972#[target_feature(enable = "avx2")]
2973#[cfg_attr(test, assert_instr(vpsravd))]
2974#[stable(feature = "simd_x86", since = "1.27.0")]
2975#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2976pub const fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2977    unsafe {
2978        let count = count.as_u32x4();
2979        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2980        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
2981        simd_shr(a.as_i32x4(), count).as_m128i()
2982    }
2983}
2984
2985/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2986/// corresponding element in `count` while shifting in sign bits.
2987///
2988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2989#[inline]
2990#[target_feature(enable = "avx2")]
2991#[cfg_attr(test, assert_instr(vpsravd))]
2992#[stable(feature = "simd_x86", since = "1.27.0")]
2993#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2994pub const fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2995    unsafe {
2996        let count = count.as_u32x8();
2997        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2998        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
2999        simd_shr(a.as_i32x8(), count).as_m256i()
3000    }
3001}
3002
3003/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
3004///
3005/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
3006#[inline]
3007#[target_feature(enable = "avx2")]
3008#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
3009#[rustc_legacy_const_generics(1)]
3010#[stable(feature = "simd_x86", since = "1.27.0")]
3011#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3012pub const fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
3013    static_assert_uimm_bits!(IMM8, 8);
3014    _mm256_bsrli_epi128::<IMM8>(a)
3015}
3016
3017/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
3018///
3019/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
3020#[inline]
3021#[target_feature(enable = "avx2")]
3022#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
3023#[rustc_legacy_const_generics(1)]
3024#[stable(feature = "simd_x86", since = "1.27.0")]
3025#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3026pub const fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
3027    static_assert_uimm_bits!(IMM8, 8);
3028    const fn mask(shift: i32, i: u32) -> u32 {
3029        let shift = shift as u32 & 0xff;
3030        if shift > 15 || (15 - (i % 16)) < shift {
3031            0
3032        } else {
3033            32 + (i + shift)
3034        }
3035    }
3036    unsafe {
3037        let a = a.as_i8x32();
3038        let r: i8x32 = simd_shuffle!(
3039            i8x32::ZERO,
3040            a,
3041            [
3042                mask(IMM8, 0),
3043                mask(IMM8, 1),
3044                mask(IMM8, 2),
3045                mask(IMM8, 3),
3046                mask(IMM8, 4),
3047                mask(IMM8, 5),
3048                mask(IMM8, 6),
3049                mask(IMM8, 7),
3050                mask(IMM8, 8),
3051                mask(IMM8, 9),
3052                mask(IMM8, 10),
3053                mask(IMM8, 11),
3054                mask(IMM8, 12),
3055                mask(IMM8, 13),
3056                mask(IMM8, 14),
3057                mask(IMM8, 15),
3058                mask(IMM8, 16),
3059                mask(IMM8, 17),
3060                mask(IMM8, 18),
3061                mask(IMM8, 19),
3062                mask(IMM8, 20),
3063                mask(IMM8, 21),
3064                mask(IMM8, 22),
3065                mask(IMM8, 23),
3066                mask(IMM8, 24),
3067                mask(IMM8, 25),
3068                mask(IMM8, 26),
3069                mask(IMM8, 27),
3070                mask(IMM8, 28),
3071                mask(IMM8, 29),
3072                mask(IMM8, 30),
3073                mask(IMM8, 31),
3074            ],
3075        );
3076        transmute(r)
3077    }
3078}
3079
3080/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3081/// zeros.
3082///
3083/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3084#[inline]
3085#[target_feature(enable = "avx2")]
3086#[cfg_attr(test, assert_instr(vpsrlw))]
3087#[stable(feature = "simd_x86", since = "1.27.0")]
3088pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3089    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3090}
3091
3092/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3093/// zeros.
3094///
3095/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3096#[inline]
3097#[target_feature(enable = "avx2")]
3098#[cfg_attr(test, assert_instr(vpsrld))]
3099#[stable(feature = "simd_x86", since = "1.27.0")]
3100pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3101    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3102}
3103
3104/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3105/// zeros.
3106///
3107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3108#[inline]
3109#[target_feature(enable = "avx2")]
3110#[cfg_attr(test, assert_instr(vpsrlq))]
3111#[stable(feature = "simd_x86", since = "1.27.0")]
3112pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3113    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3114}
3115
3116/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3117/// zeros.
3118///
3119/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3120#[inline]
3121#[target_feature(enable = "avx2")]
3122#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3123#[rustc_legacy_const_generics(1)]
3124#[stable(feature = "simd_x86", since = "1.27.0")]
3125#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3126pub const fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3127    static_assert_uimm_bits!(IMM8, 8);
3128    unsafe {
3129        if IMM8 >= 16 {
3130            _mm256_setzero_si256()
3131        } else {
3132            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3133        }
3134    }
3135}
3136
3137/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3138/// zeros.
3139///
3140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3141#[inline]
3142#[target_feature(enable = "avx2")]
3143#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3144#[rustc_legacy_const_generics(1)]
3145#[stable(feature = "simd_x86", since = "1.27.0")]
3146#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3147pub const fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3148    static_assert_uimm_bits!(IMM8, 8);
3149    unsafe {
3150        if IMM8 >= 32 {
3151            _mm256_setzero_si256()
3152        } else {
3153            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3154        }
3155    }
3156}
3157
3158/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3159/// zeros.
3160///
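/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing that the shift is logical:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(16, 32, 64, -1);
///
/// let r = _mm256_srli_epi64::<4>(a);
///
/// // The shift is logical, so the sign bit of -1 is not propagated.
/// let expected = _mm256_setr_epi64x(1, 2, 4, (u64::MAX >> 4) as i64);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///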
3161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3162#[inline]
3163#[target_feature(enable = "avx2")]
3164#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3165#[rustc_legacy_const_generics(1)]
3166#[stable(feature = "simd_x86", since = "1.27.0")]
3167#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3168pub const fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3169    static_assert_uimm_bits!(IMM8, 8);
3170    unsafe {
3171        if IMM8 >= 64 {
3172            _mm256_setzero_si256()
3173        } else {
3174            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3175        }
3176    }
3177}
3178
3179/// Shifts packed 32-bit integers in `a` right by the amount specified by
3180/// the corresponding element in `count` while shifting in zeros.
3181///
3182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3183#[inline]
3184#[target_feature(enable = "avx2")]
3185#[cfg_attr(test, assert_instr(vpsrlvd))]
3186#[stable(feature = "simd_x86", since = "1.27.0")]
3187#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3188pub const fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3189    unsafe {
3190        let count = count.as_u32x4();
3191        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3192        let count = simd_select(no_overflow, count, u32x4::ZERO);
3193        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
3194    }
3195}
3196
3197/// Shifts packed 32-bit integers in `a` right by the amount specified by
3198/// the corresponding element in `count` while shifting in zeros.
3199///
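/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing per-element counts, including out-of-range ones:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(16);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 32, 100);
///
/// let r = _mm256_srlv_epi32(a, count);
///
/// // A per-element count of 32 or more produces zero.
/// let expected = _mm256_setr_epi32(16, 8, 4, 2, 1, 0, 0, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///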
3200/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3201#[inline]
3202#[target_feature(enable = "avx2")]
3203#[cfg_attr(test, assert_instr(vpsrlvd))]
3204#[stable(feature = "simd_x86", since = "1.27.0")]
3205#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3206pub const fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3207    unsafe {
3208        let count = count.as_u32x8();
3209        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3210        let count = simd_select(no_overflow, count, u32x8::ZERO);
3211        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
3212    }
3213}
3214
3215/// Shifts packed 64-bit integers in `a` right by the amount specified by
3216/// the corresponding element in `count` while shifting in zeros.
3217///
3218/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3219#[inline]
3220#[target_feature(enable = "avx2")]
3221#[cfg_attr(test, assert_instr(vpsrlvq))]
3222#[stable(feature = "simd_x86", since = "1.27.0")]
3223#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3224pub const fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3225    unsafe {
3226        let count = count.as_u64x2();
3227        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3228        let count = simd_select(no_overflow, count, u64x2::ZERO);
3229        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3230    }
3231}
3232
3233/// Shifts packed 64-bit integers in `a` right by the amount specified by
3234/// the corresponding element in `count` while shifting in zeros.
3235///
3236/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3237#[inline]
3238#[target_feature(enable = "avx2")]
3239#[cfg_attr(test, assert_instr(vpsrlvq))]
3240#[stable(feature = "simd_x86", since = "1.27.0")]
3241#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3242pub const fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3243    unsafe {
3244        let count = count.as_u64x4();
3245        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3246        let count = simd_select(no_overflow, count, u64x4::ZERO);
3247        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3248    }
3249}
3250
3251/// Loads 256 bits of integer data from memory into the returned vector using a non-temporal
3252/// memory hint. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
3253/// exception may be generated. To minimize caching, the data is flagged as non-temporal
3253/// (unlikely to be used again soon).
3254///
3255/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3256#[inline]
3257#[target_feature(enable = "avx2")]
3258#[cfg_attr(test, assert_instr(vmovntdqa))]
3259#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3260pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3261    let dst: __m256i;
3262    crate::arch::asm!(
3263        vpl!("vmovntdqa {a}"),
3264        a = out(ymm_reg) dst,
3265        p = in(reg) mem_addr,
3266        options(pure, readonly, nostack, preserves_flags),
3267    );
3268    dst
3269}
3270
3271/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3272///
3273/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3274#[inline]
3275#[target_feature(enable = "avx2")]
3276#[cfg_attr(test, assert_instr(vpsubw))]
3277#[stable(feature = "simd_x86", since = "1.27.0")]
3278#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3279pub const fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3280    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3281}
3282
3283/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3284///
3285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3286#[inline]
3287#[target_feature(enable = "avx2")]
3288#[cfg_attr(test, assert_instr(vpsubd))]
3289#[stable(feature = "simd_x86", since = "1.27.0")]
3290#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3291pub const fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3292    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3293}
3294
3295/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3296///
3297/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3298#[inline]
3299#[target_feature(enable = "avx2")]
3300#[cfg_attr(test, assert_instr(vpsubq))]
3301#[stable(feature = "simd_x86", since = "1.27.0")]
3302#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3303pub const fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3304    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3305}
3306
3307/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3308///
3309/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3310#[inline]
3311#[target_feature(enable = "avx2")]
3312#[cfg_attr(test, assert_instr(vpsubb))]
3313#[stable(feature = "simd_x86", since = "1.27.0")]
3314#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3315pub const fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3316    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3317}
3318
3319/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3320/// `a` using saturation.
3321///
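/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing the saturating behavior:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(i16::MIN);
/// let b = _mm256_set1_epi16(1);
///
/// let r = _mm256_subs_epi16(a, b);
///
/// // i16::MIN - 1 would wrap, so the result saturates at i16::MIN instead.
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi16(i16::MIN))),
///     !0
/// );
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///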
3322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3323#[inline]
3324#[target_feature(enable = "avx2")]
3325#[cfg_attr(test, assert_instr(vpsubsw))]
3326#[stable(feature = "simd_x86", since = "1.27.0")]
3327#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3328pub const fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3329    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3330}
3331
3332/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3333/// `a` using saturation.
3334///
3335/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3336#[inline]
3337#[target_feature(enable = "avx2")]
3338#[cfg_attr(test, assert_instr(vpsubsb))]
3339#[stable(feature = "simd_x86", since = "1.27.0")]
3340#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3341pub const fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3342    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3343}
3344
3345/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3346/// 16-bit integers in `a` using saturation.
3347///
3348/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3349#[inline]
3350#[target_feature(enable = "avx2")]
3351#[cfg_attr(test, assert_instr(vpsubusw))]
3352#[stable(feature = "simd_x86", since = "1.27.0")]
3353#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3354pub const fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3355    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3356}
3357
3358/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3359/// 8-bit integers in `a` using saturation.
3360///
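/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing the unsigned saturation:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(100);
///
/// let r = _mm256_subs_epu8(a, b);
///
/// // 10 - 100 underflows an unsigned byte, so the result saturates to 0.
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_setzero_si256())),
///     !0
/// );
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///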
3361/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3362#[inline]
3363#[target_feature(enable = "avx2")]
3364#[cfg_attr(test, assert_instr(vpsubusb))]
3365#[stable(feature = "simd_x86", since = "1.27.0")]
3366#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3367pub const fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3368    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3369}
3370
3371/// Unpacks and interleaves 8-bit integers from the high half of each
3372/// 128-bit lane of `a` and `b`.
3373///
3374/// ```rust
3375/// #[cfg(target_arch = "x86")]
3376/// use std::arch::x86::*;
3377/// #[cfg(target_arch = "x86_64")]
3378/// use std::arch::x86_64::*;
3379///
3380/// # fn main() {
3381/// #     if is_x86_feature_detected!("avx2") {
3382/// #         #[target_feature(enable = "avx2")]
3383/// #         unsafe fn worker() {
3384/// let a = _mm256_setr_epi8(
3385///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3386///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3387/// );
3388/// let b = _mm256_setr_epi8(
3389///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3390///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3391///     -30, -31,
3392/// );
3393///
3394/// let c = _mm256_unpackhi_epi8(a, b);
3395///
3396/// let expected = _mm256_setr_epi8(
3397///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3398///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3399///     -31,
3400/// );
3401/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3402///
3403/// #         }
3404/// #         unsafe { worker(); }
3405/// #     }
3406/// # }
3407/// ```
3408///
3409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3410#[inline]
3411#[target_feature(enable = "avx2")]
3412#[cfg_attr(test, assert_instr(vpunpckhbw))]
3413#[stable(feature = "simd_x86", since = "1.27.0")]
3414#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3415pub const fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3416    unsafe {
3417        #[rustfmt::skip]
3418        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3419                8, 40, 9, 41, 10, 42, 11, 43,
3420                12, 44, 13, 45, 14, 46, 15, 47,
3421                24, 56, 25, 57, 26, 58, 27, 59,
3422                28, 60, 29, 61, 30, 62, 31, 63,
3423        ]);
3424        transmute(r)
3425    }
3426}
3427
3428/// Unpacks and interleaves 8-bit integers from the low half of each
3429/// 128-bit lane of `a` and `b`.
3430///
3431/// ```rust
3432/// #[cfg(target_arch = "x86")]
3433/// use std::arch::x86::*;
3434/// #[cfg(target_arch = "x86_64")]
3435/// use std::arch::x86_64::*;
3436///
3437/// # fn main() {
3438/// #     if is_x86_feature_detected!("avx2") {
3439/// #         #[target_feature(enable = "avx2")]
3440/// #         unsafe fn worker() {
3441/// let a = _mm256_setr_epi8(
3442///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3443///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3444/// );
3445/// let b = _mm256_setr_epi8(
3446///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3447///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3448///     -30, -31,
3449/// );
3450///
3451/// let c = _mm256_unpacklo_epi8(a, b);
3452///
3453/// let expected = _mm256_setr_epi8(
3454///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3455///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3456/// );
3457/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3458///
3459/// #         }
3460/// #         unsafe { worker(); }
3461/// #     }
3462/// # }
3463/// ```
3464///
3465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3466#[inline]
3467#[target_feature(enable = "avx2")]
3468#[cfg_attr(test, assert_instr(vpunpcklbw))]
3469#[stable(feature = "simd_x86", since = "1.27.0")]
3470#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3471pub const fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3472    unsafe {
3473        #[rustfmt::skip]
3474        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3475            0, 32, 1, 33, 2, 34, 3, 35,
3476            4, 36, 5, 37, 6, 38, 7, 39,
3477            16, 48, 17, 49, 18, 50, 19, 51,
3478            20, 52, 21, 53, 22, 54, 23, 55,
3479        ]);
3480        transmute(r)
3481    }
3482}
3483
3484/// Unpacks and interleaves 16-bit integers from the high half of each
3485/// 128-bit lane of `a` and `b`.
3486///
3487/// ```rust
3488/// #[cfg(target_arch = "x86")]
3489/// use std::arch::x86::*;
3490/// #[cfg(target_arch = "x86_64")]
3491/// use std::arch::x86_64::*;
3492///
3493/// # fn main() {
3494/// #     if is_x86_feature_detected!("avx2") {
3495/// #         #[target_feature(enable = "avx2")]
3496/// #         unsafe fn worker() {
3497/// let a = _mm256_setr_epi16(
3498///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3499/// );
3500/// let b = _mm256_setr_epi16(
3501///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3502/// );
3503///
3504/// let c = _mm256_unpackhi_epi16(a, b);
3505///
3506/// let expected = _mm256_setr_epi16(
3507///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3508/// );
3509/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3510///
3511/// #         }
3512/// #         unsafe { worker(); }
3513/// #     }
3514/// # }
3515/// ```
3516///
3517/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3518#[inline]
3519#[target_feature(enable = "avx2")]
3520#[cfg_attr(test, assert_instr(vpunpckhwd))]
3521#[stable(feature = "simd_x86", since = "1.27.0")]
3522#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3523pub const fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3524    unsafe {
3525        let r: i16x16 = simd_shuffle!(
3526            a.as_i16x16(),
3527            b.as_i16x16(),
3528            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3529        );
3530        transmute(r)
3531    }
3532}
3533
3534/// Unpacks and interleaves 16-bit integers from the low half of each
3535/// 128-bit lane of `a` and `b`.
3536///
3537/// ```rust
3538/// #[cfg(target_arch = "x86")]
3539/// use std::arch::x86::*;
3540/// #[cfg(target_arch = "x86_64")]
3541/// use std::arch::x86_64::*;
3542///
3543/// # fn main() {
3544/// #     if is_x86_feature_detected!("avx2") {
3545/// #         #[target_feature(enable = "avx2")]
3546/// #         unsafe fn worker() {
3547///
3548/// let a = _mm256_setr_epi16(
3549///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3550/// );
3551/// let b = _mm256_setr_epi16(
3552///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3553/// );
3554///
3555/// let c = _mm256_unpacklo_epi16(a, b);
3556///
3557/// let expected = _mm256_setr_epi16(
3558///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3559/// );
3560/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3561///
3562/// #         }
3563/// #         unsafe { worker(); }
3564/// #     }
3565/// # }
3566/// ```
3567///
3568/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3569#[inline]
3570#[target_feature(enable = "avx2")]
3571#[cfg_attr(test, assert_instr(vpunpcklwd))]
3572#[stable(feature = "simd_x86", since = "1.27.0")]
3573#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3574pub const fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3575    unsafe {
3576        let r: i16x16 = simd_shuffle!(
3577            a.as_i16x16(),
3578            b.as_i16x16(),
3579            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3580        );
3581        transmute(r)
3582    }
3583}
3584
3585/// Unpacks and interleaves 32-bit integers from the high half of each
3586/// 128-bit lane of `a` and `b`.
3587///
3588/// ```rust
3589/// #[cfg(target_arch = "x86")]
3590/// use std::arch::x86::*;
3591/// #[cfg(target_arch = "x86_64")]
3592/// use std::arch::x86_64::*;
3593///
3594/// # fn main() {
3595/// #     if is_x86_feature_detected!("avx2") {
3596/// #         #[target_feature(enable = "avx2")]
3597/// #         unsafe fn worker() {
3598/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3599/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3600///
3601/// let c = _mm256_unpackhi_epi32(a, b);
3602///
3603/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3604/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3605///
3606/// #         }
3607/// #         unsafe { worker(); }
3608/// #     }
3609/// # }
3610/// ```
3611///
3612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3613#[inline]
3614#[target_feature(enable = "avx2")]
3615#[cfg_attr(test, assert_instr(vunpckhps))]
3616#[stable(feature = "simd_x86", since = "1.27.0")]
3617#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3618pub const fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3619    unsafe {
3620        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3621        transmute(r)
3622    }
3623}
3624
3625/// Unpacks and interleaves 32-bit integers from the low half of each
3626/// 128-bit lane of `a` and `b`.
3627///
3628/// ```rust
3629/// #[cfg(target_arch = "x86")]
3630/// use std::arch::x86::*;
3631/// #[cfg(target_arch = "x86_64")]
3632/// use std::arch::x86_64::*;
3633///
3634/// # fn main() {
3635/// #     if is_x86_feature_detected!("avx2") {
3636/// #         #[target_feature(enable = "avx2")]
3637/// #         unsafe fn worker() {
3638/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3639/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3640///
3641/// let c = _mm256_unpacklo_epi32(a, b);
3642///
3643/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3644/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3645///
3646/// #         }
3647/// #         unsafe { worker(); }
3648/// #     }
3649/// # }
3650/// ```
3651///
3652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3653#[inline]
3654#[target_feature(enable = "avx2")]
3655#[cfg_attr(test, assert_instr(vunpcklps))]
3656#[stable(feature = "simd_x86", since = "1.27.0")]
3657#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3658pub const fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3659    unsafe {
3660        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3661        transmute(r)
3662    }
3663}
3664
3665/// Unpacks and interleaves 64-bit integers from the high half of each
3666/// 128-bit lane of `a` and `b`.
3667///
3668/// ```rust
3669/// #[cfg(target_arch = "x86")]
3670/// use std::arch::x86::*;
3671/// #[cfg(target_arch = "x86_64")]
3672/// use std::arch::x86_64::*;
3673///
3674/// # fn main() {
3675/// #     if is_x86_feature_detected!("avx2") {
3676/// #         #[target_feature(enable = "avx2")]
3677/// #         unsafe fn worker() {
3678/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3679/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3680///
3681/// let c = _mm256_unpackhi_epi64(a, b);
3682///
3683/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3684/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3685///
3686/// #         }
3687/// #         unsafe { worker(); }
3688/// #     }
3689/// # }
3690/// ```
3691///
3692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3693#[inline]
3694#[target_feature(enable = "avx2")]
3695#[cfg_attr(test, assert_instr(vunpckhpd))]
3696#[stable(feature = "simd_x86", since = "1.27.0")]
3697#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3698pub const fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3699    unsafe {
3700        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3701        transmute(r)
3702    }
3703}
3704
3705/// Unpacks and interleaves 64-bit integers from the low half of each
3706/// 128-bit lane of `a` and `b`.
3707///
3708/// ```rust
3709/// #[cfg(target_arch = "x86")]
3710/// use std::arch::x86::*;
3711/// #[cfg(target_arch = "x86_64")]
3712/// use std::arch::x86_64::*;
3713///
3714/// # fn main() {
3715/// #     if is_x86_feature_detected!("avx2") {
3716/// #         #[target_feature(enable = "avx2")]
3717/// #         unsafe fn worker() {
3718/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3719/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3720///
3721/// let c = _mm256_unpacklo_epi64(a, b);
3722///
3723/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3724/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3725///
3726/// #         }
3727/// #         unsafe { worker(); }
3728/// #     }
3729/// # }
3730/// ```
3731///
3732/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3733#[inline]
3734#[target_feature(enable = "avx2")]
3735#[cfg_attr(test, assert_instr(vunpcklpd))]
3736#[stable(feature = "simd_x86", since = "1.27.0")]
3737#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3738pub const fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3739    unsafe {
3740        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3741        transmute(r)
3742    }
3743}
3744
3745/// Computes the bitwise XOR of 256 bits (representing integer data)
3746/// in `a` and `b`.
3747///
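/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at runtime):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
///
/// let r = _mm256_xor_si256(a, a);
///
/// // XOR-ing a value with itself always yields zero.
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_setzero_si256())),
///     !0
/// );
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///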
3748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3749#[inline]
3750#[target_feature(enable = "avx2")]
3751#[cfg_attr(test, assert_instr(vxorps))]
3752#[stable(feature = "simd_x86", since = "1.27.0")]
3753#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3754pub const fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3755    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3756}
3757
3758/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3759/// integer containing the zero-extended integer data.
3760///
3761/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3762///
3763/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3764#[inline]
3765#[target_feature(enable = "avx2")]
3766// This intrinsic has no corresponding instruction.
3767#[rustc_legacy_const_generics(1)]
3768#[stable(feature = "simd_x86", since = "1.27.0")]
3769#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3770pub const fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3771    static_assert_uimm_bits!(INDEX, 5);
3772    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3773}
3774
3775/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3776/// integer containing the zero-extended integer data.
3777///
3778/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3779///
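/// A minimal usage sketch (illustrative values; assumes AVX2 is detected at
/// runtime) showing the zero-extension of a negative element:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi16(0, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
///
/// // Element 1 is -1; it is returned zero-extended, i.e. as 0xFFFF.
/// assert_eq!(_mm256_extract_epi16::<1>(a), 0xFFFF);
/// assert_eq!(_mm256_extract_epi16::<2>(a), 2);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///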
3780/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3781#[inline]
3782#[target_feature(enable = "avx2")]
3783// This intrinsic has no corresponding instruction.
3784#[rustc_legacy_const_generics(1)]
3785#[stable(feature = "simd_x86", since = "1.27.0")]
3786#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3787pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3788    static_assert_uimm_bits!(INDEX, 4);
3789    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3790}
3791
3792#[allow(improper_ctypes)]
3793unsafe extern "C" {
3794    #[link_name = "llvm.x86.avx2.phadd.sw"]
3795    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3796    #[link_name = "llvm.x86.avx2.phsub.sw"]
3797    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3798    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3799    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3800    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3801    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
3802    #[link_name = "llvm.x86.avx2.mpsadbw"]
3803    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
3804    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3805    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3806    #[link_name = "llvm.x86.avx2.packsswb"]
3807    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3808    #[link_name = "llvm.x86.avx2.packssdw"]
3809    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3810    #[link_name = "llvm.x86.avx2.packuswb"]
3811    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3812    #[link_name = "llvm.x86.avx2.packusdw"]
3813    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3814    #[link_name = "llvm.x86.avx2.psad.bw"]
3815    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3816    #[link_name = "llvm.x86.avx2.psign.b"]
3817    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3818    #[link_name = "llvm.x86.avx2.psign.w"]
3819    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3820    #[link_name = "llvm.x86.avx2.psign.d"]
3821    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3822    #[link_name = "llvm.x86.avx2.psll.w"]
3823    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3824    #[link_name = "llvm.x86.avx2.psll.d"]
3825    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3826    #[link_name = "llvm.x86.avx2.psll.q"]
3827    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3828    #[link_name = "llvm.x86.avx2.psra.w"]
3829    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3830    #[link_name = "llvm.x86.avx2.psra.d"]
3831    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3832    #[link_name = "llvm.x86.avx2.psrl.w"]
3833    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3834    #[link_name = "llvm.x86.avx2.psrl.d"]
3835    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3836    #[link_name = "llvm.x86.avx2.psrl.q"]
3837    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3838    #[link_name = "llvm.x86.avx2.pshuf.b"]
3839    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3840    #[link_name = "llvm.x86.avx2.permd"]
3841    fn permd(a: u32x8, b: u32x8) -> u32x8;
3842    #[link_name = "llvm.x86.avx2.permps"]
3843    fn permps(a: __m256, b: i32x8) -> __m256;
3844    #[link_name = "llvm.x86.avx2.gather.d.d"]
3845    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3846    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3847    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3848    #[link_name = "llvm.x86.avx2.gather.d.q"]
3849    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3850    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3851    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3852    #[link_name = "llvm.x86.avx2.gather.q.d"]
3853    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3854    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3855    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3856    #[link_name = "llvm.x86.avx2.gather.q.q"]
3857    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3858    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3859    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3860    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3861    fn pgatherdpd(
3862        src: __m128d,
3863        slice: *const i8,
3864        offsets: i32x4,
3865        mask: __m128d,
3866        scale: i8,
3867    ) -> __m128d;
3868    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3869    fn vpgatherdpd(
3870        src: __m256d,
3871        slice: *const i8,
3872        offsets: i32x4,
3873        mask: __m256d,
3874        scale: i8,
3875    ) -> __m256d;
3876    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3877    fn pgatherqpd(
3878        src: __m128d,
3879        slice: *const i8,
3880        offsets: i64x2,
3881        mask: __m128d,
3882        scale: i8,
3883    ) -> __m128d;
3884    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3885    fn vpgatherqpd(
3886        src: __m256d,
3887        slice: *const i8,
3888        offsets: i64x4,
3889        mask: __m256d,
3890        scale: i8,
3891    ) -> __m256d;
3892    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3893    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3894    -> __m128;
3895    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3896    fn vpgatherdps(
3897        src: __m256,
3898        slice: *const i8,
3899        offsets: i32x8,
3900        mask: __m256,
3901        scale: i8,
3902    ) -> __m256;
3903    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3904    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3905    -> __m128;
3906    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3907    fn vpgatherqps(
3908        src: __m128,
3909        slice: *const i8,
3910        offsets: i64x4,
3911        mask: __m128,
3912        scale: i8,
3913    ) -> __m128;
3914}
3915
3916#[cfg(test)]
3917mod tests {
3918    use crate::core_arch::assert_eq_const as assert_eq;
3919
3920    use stdarch_test::simd_test;
3921
3922    use crate::core_arch::x86::*;
3923
3924    #[simd_test(enable = "avx2")]
3925    const fn test_mm256_abs_epi32() {
3926        #[rustfmt::skip]
3927        let a = _mm256_setr_epi32(
3928            0, 1, -1, i32::MAX,
3929            i32::MIN, 100, -100, -32,
3930        );
3931        let r = _mm256_abs_epi32(a);
3932        #[rustfmt::skip]
3933        let e = _mm256_setr_epi32(
3934            0, 1, 1, i32::MAX,
3935            i32::MAX.wrapping_add(1), 100, 100, 32,
3936        );
3937        assert_eq_m256i(r, e);
3938    }
3939
3940    #[simd_test(enable = "avx2")]
3941    const fn test_mm256_abs_epi16() {
3942        #[rustfmt::skip]
3943        let a = _mm256_setr_epi16(
3944            0,  1, -1, 2, -2, 3, -3, 4,
3945            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3946        );
3947        let r = _mm256_abs_epi16(a);
3948        #[rustfmt::skip]
3949        let e = _mm256_setr_epi16(
3950            0, 1, 1, 2, 2, 3, 3, 4,
3951            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3952        );
3953        assert_eq_m256i(r, e);
3954    }
3955
3956    #[simd_test(enable = "avx2")]
3957    const fn test_mm256_abs_epi8() {
3958        #[rustfmt::skip]
3959        let a = _mm256_setr_epi8(
3960            0, 1, -1, 2, -2, 3, -3, 4,
3961            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3962            0, 1, -1, 2, -2, 3, -3, 4,
3963            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3964        );
3965        let r = _mm256_abs_epi8(a);
3966        #[rustfmt::skip]
3967        let e = _mm256_setr_epi8(
3968            0, 1, 1, 2, 2, 3, 3, 4,
3969            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3970            0, 1, 1, 2, 2, 3, 3, 4,
3971            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3972        );
3973        assert_eq_m256i(r, e);
3974    }
3975
3976    #[simd_test(enable = "avx2")]
3977    const fn test_mm256_add_epi64() {
3978        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
3979        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
3980        let r = _mm256_add_epi64(a, b);
3981        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
3982        assert_eq_m256i(r, e);
3983    }
3984
3985    #[simd_test(enable = "avx2")]
3986    const fn test_mm256_add_epi32() {
3987        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
3988        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3989        let r = _mm256_add_epi32(a, b);
3990        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
3991        assert_eq_m256i(r, e);
3992    }
3993
3994    #[simd_test(enable = "avx2")]
3995    const fn test_mm256_add_epi16() {
3996        #[rustfmt::skip]
3997        let a = _mm256_setr_epi16(
3998            0, 1, 2, 3, 4, 5, 6, 7,
3999            8, 9, 10, 11, 12, 13, 14, 15,
4000        );
4001        #[rustfmt::skip]
4002        let b = _mm256_setr_epi16(
4003            0, 1, 2, 3, 4, 5, 6, 7,
4004            8, 9, 10, 11, 12, 13, 14, 15,
4005        );
4006        let r = _mm256_add_epi16(a, b);
4007        #[rustfmt::skip]
4008        let e = _mm256_setr_epi16(
4009            0, 2, 4, 6, 8, 10, 12, 14,
4010            16, 18, 20, 22, 24, 26, 28, 30,
4011        );
4012        assert_eq_m256i(r, e);
4013    }
4014
4015    #[simd_test(enable = "avx2")]
4016    const fn test_mm256_add_epi8() {
4017        #[rustfmt::skip]
4018        let a = _mm256_setr_epi8(
4019            0, 1, 2, 3, 4, 5, 6, 7,
4020            8, 9, 10, 11, 12, 13, 14, 15,
4021            16, 17, 18, 19, 20, 21, 22, 23,
4022            24, 25, 26, 27, 28, 29, 30, 31,
4023        );
4024        #[rustfmt::skip]
4025        let b = _mm256_setr_epi8(
4026            0, 1, 2, 3, 4, 5, 6, 7,
4027            8, 9, 10, 11, 12, 13, 14, 15,
4028            16, 17, 18, 19, 20, 21, 22, 23,
4029            24, 25, 26, 27, 28, 29, 30, 31,
4030        );
4031        let r = _mm256_add_epi8(a, b);
4032        #[rustfmt::skip]
4033        let e = _mm256_setr_epi8(
4034            0, 2, 4, 6, 8, 10, 12, 14,
4035            16, 18, 20, 22, 24, 26, 28, 30,
4036            32, 34, 36, 38, 40, 42, 44, 46,
4037            48, 50, 52, 54, 56, 58, 60, 62,
4038        );
4039        assert_eq_m256i(r, e);
4040    }
4041
4042    #[simd_test(enable = "avx2")]
4043    const fn test_mm256_adds_epi8() {
4044        #[rustfmt::skip]
4045        let a = _mm256_setr_epi8(
4046            0, 1, 2, 3, 4, 5, 6, 7,
4047            8, 9, 10, 11, 12, 13, 14, 15,
4048            16, 17, 18, 19, 20, 21, 22, 23,
4049            24, 25, 26, 27, 28, 29, 30, 31,
4050        );
4051        #[rustfmt::skip]
4052        let b = _mm256_setr_epi8(
4053            32, 33, 34, 35, 36, 37, 38, 39,
4054            40, 41, 42, 43, 44, 45, 46, 47,
4055            48, 49, 50, 51, 52, 53, 54, 55,
4056            56, 57, 58, 59, 60, 61, 62, 63,
4057        );
4058        let r = _mm256_adds_epi8(a, b);
4059        #[rustfmt::skip]
4060        let e = _mm256_setr_epi8(
4061            32, 34, 36, 38, 40, 42, 44, 46,
4062            48, 50, 52, 54, 56, 58, 60, 62,
4063            64, 66, 68, 70, 72, 74, 76, 78,
4064            80, 82, 84, 86, 88, 90, 92, 94,
4065        );
4066        assert_eq_m256i(r, e);
4067    }
4068
4069    #[simd_test(enable = "avx2")]
4070    fn test_mm256_adds_epi8_saturate_positive() {
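        // i8::MAX + 1 saturates to i8::MAX, so every lane of the sum equals `a`.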
4071        let a = _mm256_set1_epi8(0x7F);
4072        let b = _mm256_set1_epi8(1);
4073        let r = _mm256_adds_epi8(a, b);
4074        assert_eq_m256i(r, a);
4075    }
4076
4077    #[simd_test(enable = "avx2")]
4078    fn test_mm256_adds_epi8_saturate_negative() {
4079        let a = _mm256_set1_epi8(-0x80);
4080        let b = _mm256_set1_epi8(-1);
4081        let r = _mm256_adds_epi8(a, b);
4082        assert_eq_m256i(r, a);
4083    }
4084
4085    #[simd_test(enable = "avx2")]
4086    const fn test_mm256_adds_epi16() {
4087        #[rustfmt::skip]
4088        let a = _mm256_setr_epi16(
4089            0, 1, 2, 3, 4, 5, 6, 7,
4090            8, 9, 10, 11, 12, 13, 14, 15,
4091        );
4092        #[rustfmt::skip]
4093        let b = _mm256_setr_epi16(
4094            32, 33, 34, 35, 36, 37, 38, 39,
4095            40, 41, 42, 43, 44, 45, 46, 47,
4096        );
4097        let r = _mm256_adds_epi16(a, b);
4098        #[rustfmt::skip]
4099        let e = _mm256_setr_epi16(
4100            32, 34, 36, 38, 40, 42, 44, 46,
4101            48, 50, 52, 54, 56, 58, 60, 62,
4102        );
4103
4104        assert_eq_m256i(r, e);
4105    }
4106
4107    #[simd_test(enable = "avx2")]
4108    fn test_mm256_adds_epi16_saturate_positive() {
4109        let a = _mm256_set1_epi16(0x7FFF);
4110        let b = _mm256_set1_epi16(1);
4111        let r = _mm256_adds_epi16(a, b);
4112        assert_eq_m256i(r, a);
4113    }
4114
4115    #[simd_test(enable = "avx2")]
4116    fn test_mm256_adds_epi16_saturate_negative() {
4117        let a = _mm256_set1_epi16(-0x8000);
4118        let b = _mm256_set1_epi16(-1);
4119        let r = _mm256_adds_epi16(a, b);
4120        assert_eq_m256i(r, a);
4121    }
4122
4123    #[simd_test(enable = "avx2")]
4124    const fn test_mm256_adds_epu8() {
4125        #[rustfmt::skip]
4126        let a = _mm256_setr_epi8(
4127            0, 1, 2, 3, 4, 5, 6, 7,
4128            8, 9, 10, 11, 12, 13, 14, 15,
4129            16, 17, 18, 19, 20, 21, 22, 23,
4130            24, 25, 26, 27, 28, 29, 30, 31,
4131        );
4132        #[rustfmt::skip]
4133        let b = _mm256_setr_epi8(
4134            32, 33, 34, 35, 36, 37, 38, 39,
4135            40, 41, 42, 43, 44, 45, 46, 47,
4136            48, 49, 50, 51, 52, 53, 54, 55,
4137            56, 57, 58, 59, 60, 61, 62, 63,
4138        );
4139        let r = _mm256_adds_epu8(a, b);
4140        #[rustfmt::skip]
4141        let e = _mm256_setr_epi8(
4142            32, 34, 36, 38, 40, 42, 44, 46,
4143            48, 50, 52, 54, 56, 58, 60, 62,
4144            64, 66, 68, 70, 72, 74, 76, 78,
4145            80, 82, 84, 86, 88, 90, 92, 94,
4146        );
4147        assert_eq_m256i(r, e);
4148    }
4149
4150    #[simd_test(enable = "avx2")]
4151    fn test_mm256_adds_epu8_saturate() {
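        // !0 is 0xFF, and 0xFF + 1 saturates to u8::MAX, so the sum equals `a`.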
4152        let a = _mm256_set1_epi8(!0);
4153        let b = _mm256_set1_epi8(1);
4154        let r = _mm256_adds_epu8(a, b);
4155        assert_eq_m256i(r, a);
4156    }
4157
4158    #[simd_test(enable = "avx2")]
4159    const fn test_mm256_adds_epu16() {
4160        #[rustfmt::skip]
4161        let a = _mm256_setr_epi16(
4162            0, 1, 2, 3, 4, 5, 6, 7,
4163            8, 9, 10, 11, 12, 13, 14, 15,
4164        );
4165        #[rustfmt::skip]
4166        let b = _mm256_setr_epi16(
4167            32, 33, 34, 35, 36, 37, 38, 39,
4168            40, 41, 42, 43, 44, 45, 46, 47,
4169        );
4170        let r = _mm256_adds_epu16(a, b);
4171        #[rustfmt::skip]
4172        let e = _mm256_setr_epi16(
4173            32, 34, 36, 38, 40, 42, 44, 46,
4174            48, 50, 52, 54, 56, 58, 60, 62,
4175        );
4176
4177        assert_eq_m256i(r, e);
4178    }
4179
4180    #[simd_test(enable = "avx2")]
4181    fn test_mm256_adds_epu16_saturate() {
4182        let a = _mm256_set1_epi16(!0);
4183        let b = _mm256_set1_epi16(1);
4184        let r = _mm256_adds_epu16(a, b);
4185        assert_eq_m256i(r, a);
4186    }
4187
4188    #[simd_test(enable = "avx2")]
4189    const fn test_mm256_and_si256() {
4190        let a = _mm256_set1_epi8(5);
4191        let b = _mm256_set1_epi8(3);
4192        let got = _mm256_and_si256(a, b);
4193        assert_eq_m256i(got, _mm256_set1_epi8(1));
4194    }
4195
4196    #[simd_test(enable = "avx2")]
4197    const fn test_mm256_andnot_si256() {
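        // vpandn computes `!a & b`: !0b101 & 0b011 == 0b010.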
4198        let a = _mm256_set1_epi8(5);
4199        let b = _mm256_set1_epi8(3);
4200        let got = _mm256_andnot_si256(a, b);
4201        assert_eq_m256i(got, _mm256_set1_epi8(2));
4202    }
4203
4204    #[simd_test(enable = "avx2")]
4205    const fn test_mm256_avg_epu8() {
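        // vpavgb rounds the average up: (3 + 9 + 1) >> 1 == 6.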
4206        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4207        let r = _mm256_avg_epu8(a, b);
4208        assert_eq_m256i(r, _mm256_set1_epi8(6));
4209    }
4210
4211    #[simd_test(enable = "avx2")]
4212    const fn test_mm256_avg_epu16() {
4213        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4214        let r = _mm256_avg_epu16(a, b);
4215        assert_eq_m256i(r, _mm256_set1_epi16(6));
4216    }
4217
4218    #[simd_test(enable = "avx2")]
4219    const fn test_mm_blend_epi32() {
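        // Each immediate bit picks the corresponding dword from `b` (1) or `a` (0),
        // so 0x01 and 0x0E with swapped operands produce the same vector.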
4220        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4221        let e = _mm_setr_epi32(9, 3, 3, 3);
4222        let r = _mm_blend_epi32::<0x01>(a, b);
4223        assert_eq_m128i(r, e);
4224
4225        let r = _mm_blend_epi32::<0x0E>(b, a);
4226        assert_eq_m128i(r, e);
4227    }
4228
4229    #[simd_test(enable = "avx2")]
4230    const fn test_mm256_blend_epi32() {
4231        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4232        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4233        let r = _mm256_blend_epi32::<0x01>(a, b);
4234        assert_eq_m256i(r, e);
4235
4236        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4237        let r = _mm256_blend_epi32::<0x82>(a, b);
4238        assert_eq_m256i(r, e);
4239
4240        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4241        let r = _mm256_blend_epi32::<0x7C>(a, b);
4242        assert_eq_m256i(r, e);
4243    }
4244
4245    #[simd_test(enable = "avx2")]
4246    const fn test_mm256_blend_epi16() {
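        // The 8-bit immediate is applied to each 128-bit half separately,
        // so bit 0 selects both word 0 and word 8 from `b`.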
4247        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4248        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4249        let r = _mm256_blend_epi16::<0x01>(a, b);
4250        assert_eq_m256i(r, e);
4251
4252        let r = _mm256_blend_epi16::<0xFE>(b, a);
4253        assert_eq_m256i(r, e);
4254    }
4255
4256    #[simd_test(enable = "avx2")]
4257    const fn test_mm256_blendv_epi8() {
4258        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4259        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4260        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4261        let r = _mm256_blendv_epi8(a, b, mask);
4262        assert_eq_m256i(r, e);
4263    }
4264
4265    #[simd_test(enable = "avx2")]
4266    const fn test_mm_broadcastb_epi8() {
4267        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4268        let res = _mm_broadcastb_epi8(a);
4269        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4270    }
4271
4272    #[simd_test(enable = "avx2")]
4273    const fn test_mm256_broadcastb_epi8() {
4274        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4275        let res = _mm256_broadcastb_epi8(a);
4276        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4277    }
4278
4279    #[simd_test(enable = "avx2")]
4280    const fn test_mm_broadcastd_epi32() {
4281        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4282        let res = _mm_broadcastd_epi32(a);
4283        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4284    }
4285
4286    #[simd_test(enable = "avx2")]
4287    const fn test_mm256_broadcastd_epi32() {
4288        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4289        let res = _mm256_broadcastd_epi32(a);
4290        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4291    }
4292
4293    #[simd_test(enable = "avx2")]
4294    const fn test_mm_broadcastq_epi64() {
4295        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4296        let res = _mm_broadcastq_epi64(a);
4297        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4298    }
4299
4300    #[simd_test(enable = "avx2")]
4301    const fn test_mm256_broadcastq_epi64() {
4302        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4303        let res = _mm256_broadcastq_epi64(a);
4304        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4305    }
4306
4307    #[simd_test(enable = "avx2")]
4308    const fn test_mm_broadcastsd_pd() {
4309        let a = _mm_setr_pd(6.88, 3.44);
4310        let res = _mm_broadcastsd_pd(a);
4311        assert_eq_m128d(res, _mm_set1_pd(6.88));
4312    }
4313
4314    #[simd_test(enable = "avx2")]
4315    const fn test_mm256_broadcastsd_pd() {
4316        let a = _mm_setr_pd(6.88, 3.44);
4317        let res = _mm256_broadcastsd_pd(a);
4318        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4319    }
4320
4321    #[simd_test(enable = "avx2")]
4322    const fn test_mm_broadcastsi128_si256() {
4323        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4324        let res = _mm_broadcastsi128_si256(a);
4325        let retval = _mm256_setr_epi64x(
4326            0x0987654321012334,
4327            0x5678909876543210,
4328            0x0987654321012334,
4329            0x5678909876543210,
4330        );
4331        assert_eq_m256i(res, retval);
4332    }
4333
4334    #[simd_test(enable = "avx2")]
4335    const fn test_mm256_broadcastsi128_si256() {
4336        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4337        let res = _mm256_broadcastsi128_si256(a);
4338        let retval = _mm256_setr_epi64x(
4339            0x0987654321012334,
4340            0x5678909876543210,
4341            0x0987654321012334,
4342            0x5678909876543210,
4343        );
4344        assert_eq_m256i(res, retval);
4345    }
4346
4347    #[simd_test(enable = "avx2")]
4348    const fn test_mm_broadcastss_ps() {
4349        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4350        let res = _mm_broadcastss_ps(a);
4351        assert_eq_m128(res, _mm_set1_ps(6.88));
4352    }
4353
4354    #[simd_test(enable = "avx2")]
4355    const fn test_mm256_broadcastss_ps() {
4356        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4357        let res = _mm256_broadcastss_ps(a);
4358        assert_eq_m256(res, _mm256_set1_ps(6.88));
4359    }
4360
4361    #[simd_test(enable = "avx2")]
4362    const fn test_mm_broadcastw_epi16() {
4363        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4364        let res = _mm_broadcastw_epi16(a);
4365        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4366    }
4367
4368    #[simd_test(enable = "avx2")]
4369    const fn test_mm256_broadcastw_epi16() {
4370        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4371        let res = _mm256_broadcastw_epi16(a);
4372        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4373    }
4374
4375    #[simd_test(enable = "avx2")]
4376    const fn test_mm256_cmpeq_epi8() {
4377        #[rustfmt::skip]
4378        let a = _mm256_setr_epi8(
4379            0, 1, 2, 3, 4, 5, 6, 7,
4380            8, 9, 10, 11, 12, 13, 14, 15,
4381            16, 17, 18, 19, 20, 21, 22, 23,
4382            24, 25, 26, 27, 28, 29, 30, 31,
4383        );
4384        #[rustfmt::skip]
4385        let b = _mm256_setr_epi8(
4386            31, 30, 2, 28, 27, 26, 25, 24,
4387            23, 22, 21, 20, 19, 18, 17, 16,
4388            15, 14, 13, 12, 11, 10, 9, 8,
4389            7, 6, 5, 4, 3, 2, 1, 0,
4390        );
4391        let r = _mm256_cmpeq_epi8(a, b);
4392        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4393    }
4394
4395    #[simd_test(enable = "avx2")]
4396    const fn test_mm256_cmpeq_epi16() {
4397        #[rustfmt::skip]
4398        let a = _mm256_setr_epi16(
4399            0, 1, 2, 3, 4, 5, 6, 7,
4400            8, 9, 10, 11, 12, 13, 14, 15,
4401        );
4402        #[rustfmt::skip]
4403        let b = _mm256_setr_epi16(
4404            15, 14, 2, 12, 11, 10, 9, 8,
4405            7, 6, 5, 4, 3, 2, 1, 0,
4406        );
4407        let r = _mm256_cmpeq_epi16(a, b);
4408        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4409    }
4410
4411    #[simd_test(enable = "avx2")]
4412    const fn test_mm256_cmpeq_epi32() {
4413        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4414        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4415        let r = _mm256_cmpeq_epi32(a, b);
4416        let e = _mm256_set1_epi32(0);
4417        let e = _mm256_insert_epi32::<2>(e, !0);
4418        assert_eq_m256i(r, e);
4419    }
4420
4421    #[simd_test(enable = "avx2")]
4422    const fn test_mm256_cmpeq_epi64() {
4423        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4424        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4425        let r = _mm256_cmpeq_epi64(a, b);
4426        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4427    }
4428
4429    #[simd_test(enable = "avx2")]
4430    const fn test_mm256_cmpgt_epi8() {
4431        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4432        let b = _mm256_set1_epi8(0);
4433        let r = _mm256_cmpgt_epi8(a, b);
4434        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4435    }
4436
4437    #[simd_test(enable = "avx2")]
4438    const fn test_mm256_cmpgt_epi16() {
4439        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4440        let b = _mm256_set1_epi16(0);
4441        let r = _mm256_cmpgt_epi16(a, b);
4442        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4443    }
4444
4445    #[simd_test(enable = "avx2")]
4446    const fn test_mm256_cmpgt_epi32() {
4447        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4448        let b = _mm256_set1_epi32(0);
4449        let r = _mm256_cmpgt_epi32(a, b);
4450        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4451    }
4452
4453    #[simd_test(enable = "avx2")]
4454    const fn test_mm256_cmpgt_epi64() {
4455        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4456        let b = _mm256_set1_epi64x(0);
4457        let r = _mm256_cmpgt_epi64(a, b);
4458        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4459    }
4460
4461    #[simd_test(enable = "avx2")]
4462    const fn test_mm256_cvtepi8_epi16() {
4463        #[rustfmt::skip]
4464        let a = _mm_setr_epi8(
4465            0, 0, -1, 1, -2, 2, -3, 3,
4466            -4, 4, -5, 5, -6, 6, -7, 7,
4467        );
4468        #[rustfmt::skip]
4469        let r = _mm256_setr_epi16(
4470            0, 0, -1, 1, -2, 2, -3, 3,
4471            -4, 4, -5, 5, -6, 6, -7, 7,
4472        );
4473        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4474    }
4475
4476    #[simd_test(enable = "avx2")]
4477    const fn test_mm256_cvtepi8_epi32() {
4478        #[rustfmt::skip]
4479        let a = _mm_setr_epi8(
4480            0, 0, -1, 1, -2, 2, -3, 3,
4481            -4, 4, -5, 5, -6, 6, -7, 7,
4482        );
4483        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4484        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4485    }
4486
4487    #[simd_test(enable = "avx2")]
4488    const fn test_mm256_cvtepi8_epi64() {
4489        #[rustfmt::skip]
4490        let a = _mm_setr_epi8(
4491            0, 0, -1, 1, -2, 2, -3, 3,
4492            -4, 4, -5, 5, -6, 6, -7, 7,
4493        );
4494        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4495        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4496    }
4497
4498    #[simd_test(enable = "avx2")]
4499    const fn test_mm256_cvtepi16_epi32() {
4500        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4501        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4502        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4503    }
4504
4505    #[simd_test(enable = "avx2")]
4506    const fn test_mm256_cvtepi16_epi64() {
4507        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4508        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4509        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4510    }
4511
4512    #[simd_test(enable = "avx2")]
4513    const fn test_mm256_cvtepi32_epi64() {
4514        let a = _mm_setr_epi32(0, 0, -1, 1);
4515        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4516        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4517    }
4518
4519    #[simd_test(enable = "avx2")]
4520    const fn test_mm256_cvtepu16_epi32() {
4521        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4522        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4523        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4524    }
4525
4526    #[simd_test(enable = "avx2")]
4527    const fn test_mm256_cvtepu16_epi64() {
4528        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4529        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4530        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4531    }
4532
4533    #[simd_test(enable = "avx2")]
4534    const fn test_mm256_cvtepu32_epi64() {
4535        let a = _mm_setr_epi32(0, 1, 2, 3);
4536        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4537        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4538    }
4539
4540    #[simd_test(enable = "avx2")]
4541    const fn test_mm256_cvtepu8_epi16() {
4542        #[rustfmt::skip]
4543        let a = _mm_setr_epi8(
4544            0, 1, 2, 3, 4, 5, 6, 7,
4545            8, 9, 10, 11, 12, 13, 14, 15,
4546        );
4547        #[rustfmt::skip]
4548        let r = _mm256_setr_epi16(
4549            0, 1, 2, 3, 4, 5, 6, 7,
4550            8, 9, 10, 11, 12, 13, 14, 15,
4551        );
4552        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4553    }
4554
4555    #[simd_test(enable = "avx2")]
4556    const fn test_mm256_cvtepu8_epi32() {
4557        #[rustfmt::skip]
4558        let a = _mm_setr_epi8(
4559            0, 1, 2, 3, 4, 5, 6, 7,
4560            8, 9, 10, 11, 12, 13, 14, 15,
4561        );
4562        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4563        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4564    }
4565
4566    #[simd_test(enable = "avx2")]
4567    const fn test_mm256_cvtepu8_epi64() {
4568        #[rustfmt::skip]
4569        let a = _mm_setr_epi8(
4570            0, 1, 2, 3, 4, 5, 6, 7,
4571            8, 9, 10, 11, 12, 13, 14, 15,
4572        );
4573        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4574        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4575    }
4576
4577    #[simd_test(enable = "avx2")]
4578    const fn test_mm256_extracti128_si256() {
4579        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4580        let r = _mm256_extracti128_si256::<1>(a);
4581        let e = _mm_setr_epi64x(3, 4);
4582        assert_eq_m128i(r, e);
4583    }
4584
4585    #[simd_test(enable = "avx2")]
4586    const fn test_mm256_hadd_epi16() {
4587        let a = _mm256_set1_epi16(2);
4588        let b = _mm256_set1_epi16(4);
4589        let r = _mm256_hadd_epi16(a, b);
4590        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4591        assert_eq_m256i(r, e);
4592    }
4593
4594    #[simd_test(enable = "avx2")]
4595    const fn test_mm256_hadd_epi32() {
4596        let a = _mm256_set1_epi32(2);
4597        let b = _mm256_set1_epi32(4);
4598        let r = _mm256_hadd_epi32(a, b);
4599        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4600        assert_eq_m256i(r, e);
4601    }
4602
4603    #[simd_test(enable = "avx2")]
4604    fn test_mm256_hadds_epi16() {
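        // The first horizontal pair is 0x7FFF + 1, which saturates to 0x7FFF
        // instead of wrapping.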
4605        let a = _mm256_set1_epi16(2);
4606        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4607        let a = _mm256_insert_epi16::<1>(a, 1);
4608        let b = _mm256_set1_epi16(4);
4609        let r = _mm256_hadds_epi16(a, b);
4610        #[rustfmt::skip]
4611        let e = _mm256_setr_epi16(
4612            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4613            4, 4, 4, 4, 8, 8, 8, 8,
4614        );
4615        assert_eq_m256i(r, e);
4616    }
4617
4618    #[simd_test(enable = "avx2")]
4619    const fn test_mm256_hsub_epi16() {
4620        let a = _mm256_set1_epi16(2);
4621        let b = _mm256_set1_epi16(4);
4622        let r = _mm256_hsub_epi16(a, b);
4623        let e = _mm256_set1_epi16(0);
4624        assert_eq_m256i(r, e);
4625    }
4626
4627    #[simd_test(enable = "avx2")]
4628    const fn test_mm256_hsub_epi32() {
4629        let a = _mm256_set1_epi32(2);
4630        let b = _mm256_set1_epi32(4);
4631        let r = _mm256_hsub_epi32(a, b);
4632        let e = _mm256_set1_epi32(0);
4633        assert_eq_m256i(r, e);
4634    }
4635
4636    #[simd_test(enable = "avx2")]
4637    fn test_mm256_hsubs_epi16() {
4638        let a = _mm256_set1_epi16(2);
4639        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4640        let a = _mm256_insert_epi16::<1>(a, -1);
4641        let b = _mm256_set1_epi16(4);
4642        let r = _mm256_hsubs_epi16(a, b);
4643        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4644        assert_eq_m256i(r, e);
4645    }
4646
4647    #[simd_test(enable = "avx2")]
4648    fn test_mm256_madd_epi16() {
4649        let a = _mm256_set1_epi16(2);
4650        let b = _mm256_set1_epi16(4);
4651        let r = _mm256_madd_epi16(a, b);
4652        let e = _mm256_set1_epi32(16);
4653        assert_eq_m256i(r, e);
4654    }
4655
4656    #[simd_test(enable = "avx2")]
4657    const fn test_mm256_inserti128_si256() {
4658        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4659        let b = _mm_setr_epi64x(7, 8);
4660        let r = _mm256_inserti128_si256::<1>(a, b);
4661        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4662        assert_eq_m256i(r, e);
4663    }
4664
4665    #[simd_test(enable = "avx2")]
4666    fn test_mm256_maddubs_epi16() {
4667        let a = _mm256_set1_epi8(2);
4668        let b = _mm256_set1_epi8(4);
4669        let r = _mm256_maddubs_epi16(a, b);
4670        let e = _mm256_set1_epi16(16);
4671        assert_eq_m256i(r, e);
4672    }
4673
4674    #[simd_test(enable = "avx2")]
4675    const unsafe fn test_mm_maskload_epi32() {
4676        let nums = [1, 2, 3, 4];
4677        let a = &nums as *const i32;
4678        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4679        let r = _mm_maskload_epi32(a, mask);
4680        let e = _mm_setr_epi32(1, 0, 0, 4);
4681        assert_eq_m128i(r, e);
4682    }
4683
4684    #[simd_test(enable = "avx2")]
4685    const unsafe fn test_mm256_maskload_epi32() {
4686        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4687        let a = &nums as *const i32;
4688        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4689        let r = _mm256_maskload_epi32(a, mask);
4690        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4691        assert_eq_m256i(r, e);
4692    }
4693
4694    #[simd_test(enable = "avx2")]
4695    const unsafe fn test_mm_maskload_epi64() {
4696        let nums = [1_i64, 2_i64];
4697        let a = &nums as *const i64;
4698        let mask = _mm_setr_epi64x(0, -1);
4699        let r = _mm_maskload_epi64(a, mask);
4700        let e = _mm_setr_epi64x(0, 2);
4701        assert_eq_m128i(r, e);
4702    }
4703
4704    #[simd_test(enable = "avx2")]
4705    const unsafe fn test_mm256_maskload_epi64() {
4706        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4707        let a = &nums as *const i64;
4708        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4709        let r = _mm256_maskload_epi64(a, mask);
4710        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4711        assert_eq_m256i(r, e);
4712    }
4713
4714    #[simd_test(enable = "avx2")]
4715    const unsafe fn test_mm_maskstore_epi32() {
4716        let a = _mm_setr_epi32(1, 2, 3, 4);
4717        let mut arr = [-1, -1, -1, -1];
4718        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4719        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4720        let e = [1, -1, -1, 4];
4721        assert_eq!(arr, e);
4722    }
4723
4724    #[simd_test(enable = "avx2")]
4725    const unsafe fn test_mm256_maskstore_epi32() {
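        // Only lanes whose mask sign bit is set are stored; the remaining
        // elements of `arr` keep their previous values.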
4726        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4727        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4728        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4729        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4730        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4731        assert_eq!(arr, e);
4732    }
4733
4734    #[simd_test(enable = "avx2")]
4735    const unsafe fn test_mm_maskstore_epi64() {
4736        let a = _mm_setr_epi64x(1_i64, 2_i64);
4737        let mut arr = [-1_i64, -1_i64];
4738        let mask = _mm_setr_epi64x(0, -1);
4739        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4740        let e = [-1, 2];
4741        assert_eq!(arr, e);
4742    }
4743
4744    #[simd_test(enable = "avx2")]
4745    const unsafe fn test_mm256_maskstore_epi64() {
4746        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4747        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4748        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4749        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4750        let e = [-1, 2, 3, -1];
4751        assert_eq!(arr, e);
4752    }
4753
4754    #[simd_test(enable = "avx2")]
4755    const fn test_mm256_max_epi16() {
4756        let a = _mm256_set1_epi16(2);
4757        let b = _mm256_set1_epi16(4);
4758        let r = _mm256_max_epi16(a, b);
4759        assert_eq_m256i(r, b);
4760    }
4761
4762    #[simd_test(enable = "avx2")]
4763    const fn test_mm256_max_epi32() {
4764        let a = _mm256_set1_epi32(2);
4765        let b = _mm256_set1_epi32(4);
4766        let r = _mm256_max_epi32(a, b);
4767        assert_eq_m256i(r, b);
4768    }
4769
4770    #[simd_test(enable = "avx2")]
4771    const fn test_mm256_max_epi8() {
4772        let a = _mm256_set1_epi8(2);
4773        let b = _mm256_set1_epi8(4);
4774        let r = _mm256_max_epi8(a, b);
4775        assert_eq_m256i(r, b);
4776    }
4777
4778    #[simd_test(enable = "avx2")]
4779    const fn test_mm256_max_epu16() {
4780        let a = _mm256_set1_epi16(2);
4781        let b = _mm256_set1_epi16(4);
4782        let r = _mm256_max_epu16(a, b);
4783        assert_eq_m256i(r, b);
4784    }
4785
4786    #[simd_test(enable = "avx2")]
4787    const fn test_mm256_max_epu32() {
4788        let a = _mm256_set1_epi32(2);
4789        let b = _mm256_set1_epi32(4);
4790        let r = _mm256_max_epu32(a, b);
4791        assert_eq_m256i(r, b);
4792    }
4793
4794    #[simd_test(enable = "avx2")]
4795    const fn test_mm256_max_epu8() {
4796        let a = _mm256_set1_epi8(2);
4797        let b = _mm256_set1_epi8(4);
4798        let r = _mm256_max_epu8(a, b);
4799        assert_eq_m256i(r, b);
4800    }
4801
4802    #[simd_test(enable = "avx2")]
4803    const fn test_mm256_min_epi16() {
4804        let a = _mm256_set1_epi16(2);
4805        let b = _mm256_set1_epi16(4);
4806        let r = _mm256_min_epi16(a, b);
4807        assert_eq_m256i(r, a);
4808    }
4809
4810    #[simd_test(enable = "avx2")]
4811    const fn test_mm256_min_epi32() {
4812        let a = _mm256_set1_epi32(2);
4813        let b = _mm256_set1_epi32(4);
4814        let r = _mm256_min_epi32(a, b);
4815        assert_eq_m256i(r, a);
4816    }
4817
4818    #[simd_test(enable = "avx2")]
4819    const fn test_mm256_min_epi8() {
4820        let a = _mm256_set1_epi8(2);
4821        let b = _mm256_set1_epi8(4);
4822        let r = _mm256_min_epi8(a, b);
4823        assert_eq_m256i(r, a);
4824    }
4825
4826    #[simd_test(enable = "avx2")]
4827    const fn test_mm256_min_epu16() {
4828        let a = _mm256_set1_epi16(2);
4829        let b = _mm256_set1_epi16(4);
4830        let r = _mm256_min_epu16(a, b);
4831        assert_eq_m256i(r, a);
4832    }
4833
4834    #[simd_test(enable = "avx2")]
4835    const fn test_mm256_min_epu32() {
4836        let a = _mm256_set1_epi32(2);
4837        let b = _mm256_set1_epi32(4);
4838        let r = _mm256_min_epu32(a, b);
4839        assert_eq_m256i(r, a);
4840    }
4841
4842    #[simd_test(enable = "avx2")]
4843    const fn test_mm256_min_epu8() {
4844        let a = _mm256_set1_epi8(2);
4845        let b = _mm256_set1_epi8(4);
4846        let r = _mm256_min_epu8(a, b);
4847        assert_eq_m256i(r, a);
4848    }
4849
4850    #[simd_test(enable = "avx2")]
4851    const fn test_mm256_movemask_epi8() {
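        // All 32 byte sign bits are set, so the mask is 0xFFFF_FFFF, i.e. -1 as i32.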
4852        let a = _mm256_set1_epi8(-1);
4853        let r = _mm256_movemask_epi8(a);
4854        let e = -1;
4855        assert_eq!(r, e);
4856    }
4857
4858    #[simd_test(enable = "avx2")]
4859    fn test_mm256_mpsadbw_epu8() {
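        // Each 16-bit result is a sum of four absolute byte differences: 4 * |2 - 4| == 8.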
4860        let a = _mm256_set1_epi8(2);
4861        let b = _mm256_set1_epi8(4);
4862        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4863        let e = _mm256_set1_epi16(8);
4864        assert_eq_m256i(r, e);
4865    }
4866
4867    #[simd_test(enable = "avx2")]
4868    const fn test_mm256_mul_epi32() {
4869        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4870        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4871        let r = _mm256_mul_epi32(a, b);
4872        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4873        assert_eq_m256i(r, e);
4874    }
4875
4876    #[simd_test(enable = "avx2")]
4877    const fn test_mm256_mul_epu32() {
4878        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4879        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4880        let r = _mm256_mul_epu32(a, b);
4881        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4882        assert_eq_m256i(r, e);
4883    }
4884
4885    #[simd_test(enable = "avx2")]
4886    const fn test_mm256_mulhi_epi16() {
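        // 6535 * 6535 == 42_706_225, whose high 16 bits are 42_706_225 >> 16 == 651.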
4887        let a = _mm256_set1_epi16(6535);
4888        let b = _mm256_set1_epi16(6535);
4889        let r = _mm256_mulhi_epi16(a, b);
4890        let e = _mm256_set1_epi16(651);
4891        assert_eq_m256i(r, e);
4892    }
4893
4894    #[simd_test(enable = "avx2")]
4895    const fn test_mm256_mulhi_epu16() {
4896        let a = _mm256_set1_epi16(6535);
4897        let b = _mm256_set1_epi16(6535);
4898        let r = _mm256_mulhi_epu16(a, b);
4899        let e = _mm256_set1_epi16(651);
4900        assert_eq_m256i(r, e);
4901    }
4902
4903    #[simd_test(enable = "avx2")]
4904    const fn test_mm256_mullo_epi16() {
4905        let a = _mm256_set1_epi16(2);
4906        let b = _mm256_set1_epi16(4);
4907        let r = _mm256_mullo_epi16(a, b);
4908        let e = _mm256_set1_epi16(8);
4909        assert_eq_m256i(r, e);
4910    }
4911
4912    #[simd_test(enable = "avx2")]
4913    const fn test_mm256_mullo_epi32() {
4914        let a = _mm256_set1_epi32(2);
4915        let b = _mm256_set1_epi32(4);
4916        let r = _mm256_mullo_epi32(a, b);
4917        let e = _mm256_set1_epi32(8);
4918        assert_eq_m256i(r, e);
4919    }
4920
4921    #[simd_test(enable = "avx2")]
4922    fn test_mm256_mulhrs_epi16() {
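        // vpmulhrsw computes (a * b + 0x4000) >> 15 per lane, a rounded Q15 product.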
4923        let a = _mm256_set1_epi16(0x4000); // 0.5 in Q15
4924        let b = _mm256_set1_epi16(0x4000); // 0.5 in Q15
4925        let r = _mm256_mulhrs_epi16(a, b);
4926        let e = _mm256_set1_epi16(0x2000); // 0.25 in Q15
4927        assert_eq_m256i(r, e);
4928    }
4929
4930    #[simd_test(enable = "avx2")]
4931    const fn test_mm256_or_si256() {
4932        let a = _mm256_set1_epi8(-1);
4933        let b = _mm256_set1_epi8(0);
4934        let r = _mm256_or_si256(a, b);
4935        assert_eq_m256i(r, a);
4936    }
4937
4938    #[simd_test(enable = "avx2")]
4939    fn test_mm256_packs_epi16() {
4940        let a = _mm256_set1_epi16(2);
4941        let b = _mm256_set1_epi16(4);
4942        let r = _mm256_packs_epi16(a, b);
4943        #[rustfmt::skip]
4944        let e = _mm256_setr_epi8(
4945            2, 2, 2, 2, 2, 2, 2, 2,
4946            4, 4, 4, 4, 4, 4, 4, 4,
4947            2, 2, 2, 2, 2, 2, 2, 2,
4948            4, 4, 4, 4, 4, 4, 4, 4,
4949        );
4950
4951        assert_eq_m256i(r, e);
4952    }
4953
4954    #[simd_test(enable = "avx2")]
4955    fn test_mm256_packs_epi32() {
4956        let a = _mm256_set1_epi32(2);
4957        let b = _mm256_set1_epi32(4);
4958        let r = _mm256_packs_epi32(a, b);
4959        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4960
4961        assert_eq_m256i(r, e);
4962    }
4963
4964    #[simd_test(enable = "avx2")]
4965    fn test_mm256_packus_epi16() {
4966        let a = _mm256_set1_epi16(2);
4967        let b = _mm256_set1_epi16(4);
4968        let r = _mm256_packus_epi16(a, b);
4969        #[rustfmt::skip]
4970        let e = _mm256_setr_epi8(
4971            2, 2, 2, 2, 2, 2, 2, 2,
4972            4, 4, 4, 4, 4, 4, 4, 4,
4973            2, 2, 2, 2, 2, 2, 2, 2,
4974            4, 4, 4, 4, 4, 4, 4, 4,
4975        );
4976
4977        assert_eq_m256i(r, e);
4978    }
4979
4980    #[simd_test(enable = "avx2")]
4981    fn test_mm256_packus_epi32() {
4982        let a = _mm256_set1_epi32(2);
4983        let b = _mm256_set1_epi32(4);
4984        let r = _mm256_packus_epi32(a, b);
4985        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4986
4987        assert_eq_m256i(r, e);
4988    }
4989
4990    #[simd_test(enable = "avx2")]
4991    fn test_mm256_sad_epu8() {
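        // Each 64-bit lane receives the sum of eight absolute byte differences: 8 * |2 - 4| == 16.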
4992        let a = _mm256_set1_epi8(2);
4993        let b = _mm256_set1_epi8(4);
4994        let r = _mm256_sad_epu8(a, b);
4995        let e = _mm256_set1_epi64x(16);
4996        assert_eq_m256i(r, e);
4997    }
4998
4999    #[simd_test(enable = "avx2")]
5000    const fn test_mm256_shufflehi_epi16() {
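        // 0b00_01_01_11 encodes source indices (3, 1, 1, 0) for the high words of
        // each 128-bit half; the low words pass through unchanged.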
5001        #[rustfmt::skip]
5002        let a = _mm256_setr_epi16(
5003            0, 1, 2, 3, 11, 22, 33, 44,
5004            4, 5, 6, 7, 55, 66, 77, 88,
5005        );
5006        #[rustfmt::skip]
5007        let e = _mm256_setr_epi16(
5008            0, 1, 2, 3, 44, 22, 22, 11,
5009            4, 5, 6, 7, 88, 66, 66, 55,
5010        );
5011        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
5012        assert_eq_m256i(r, e);
5013    }
5014
5015    #[simd_test(enable = "avx2")]
5016    const fn test_mm256_shufflelo_epi16() {
5017        #[rustfmt::skip]
5018        let a = _mm256_setr_epi16(
5019            11, 22, 33, 44, 0, 1, 2, 3,
5020            55, 66, 77, 88, 4, 5, 6, 7,
5021        );
5022        #[rustfmt::skip]
5023        let e = _mm256_setr_epi16(
5024            44, 22, 22, 11, 0, 1, 2, 3,
5025            88, 66, 66, 55, 4, 5, 6, 7,
5026        );
5027        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
5028        assert_eq_m256i(r, e);
5029    }
5030
5031    #[simd_test(enable = "avx2")]
5032    fn test_mm256_sign_epi16() {
5033        let a = _mm256_set1_epi16(2);
5034        let b = _mm256_set1_epi16(-1);
5035        let r = _mm256_sign_epi16(a, b);
5036        let e = _mm256_set1_epi16(-2);
5037        assert_eq_m256i(r, e);
5038    }
5039
5040    #[simd_test(enable = "avx2")]
5041    fn test_mm256_sign_epi32() {
5042        let a = _mm256_set1_epi32(2);
5043        let b = _mm256_set1_epi32(-1);
5044        let r = _mm256_sign_epi32(a, b);
5045        let e = _mm256_set1_epi32(-2);
5046        assert_eq_m256i(r, e);
5047    }
5048
5049    #[simd_test(enable = "avx2")]
5050    fn test_mm256_sign_epi8() {
5051        let a = _mm256_set1_epi8(2);
5052        let b = _mm256_set1_epi8(-1);
5053        let r = _mm256_sign_epi8(a, b);
5054        let e = _mm256_set1_epi8(-2);
5055        assert_eq_m256i(r, e);
5056    }
5057
5058    #[simd_test(enable = "avx2")]
5059    fn test_mm256_sll_epi16() {
5060        let a = _mm256_set1_epi16(0xFF);
5061        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5062        let r = _mm256_sll_epi16(a, b);
5063        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5064    }
5065
5066    #[simd_test(enable = "avx2")]
5067    fn test_mm256_sll_epi32() {
5068        let a = _mm256_set1_epi32(0xFFFF);
5069        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5070        let r = _mm256_sll_epi32(a, b);
5071        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5072    }
5073
5074    #[simd_test(enable = "avx2")]
5075    fn test_mm256_sll_epi64() {
5076        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5077        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
5078        let r = _mm256_sll_epi64(a, b);
5079        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5080    }
5081
5082    #[simd_test(enable = "avx2")]
5083    const fn test_mm256_slli_epi16() {
5084        assert_eq_m256i(
5085            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5086            _mm256_set1_epi16(0xFF0),
5087        );
5088    }
5089
5090    #[simd_test(enable = "avx2")]
5091    const fn test_mm256_slli_epi32() {
5092        assert_eq_m256i(
5093            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5094            _mm256_set1_epi32(0xFFFF0),
5095        );
5096    }
5097
5098    #[simd_test(enable = "avx2")]
5099    const fn test_mm256_slli_epi64() {
5100        assert_eq_m256i(
5101            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5102            _mm256_set1_epi64x(0xFFFFFFFF0),
5103        );
5104    }
5105
5106    #[simd_test(enable = "avx2")]
5107    const fn test_mm256_slli_si256() {
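        // vpslldq shifts each 128-bit half left by 3 bytes, zero-filling the low bytes.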
5108        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5109        let r = _mm256_slli_si256::<3>(a);
5110        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5111    }
5112
5113    #[simd_test(enable = "avx2")]
5114    const fn test_mm_sllv_epi32() {
5115        let a = _mm_set1_epi32(2);
5116        let b = _mm_set1_epi32(1);
5117        let r = _mm_sllv_epi32(a, b);
5118        let e = _mm_set1_epi32(4);
5119        assert_eq_m128i(r, e);
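        // Illustrative extra check: unlike vpslld, vpsllvd takes an independent
        // shift count per lane.
        let a = _mm_setr_epi32(1, 1, 1, 1);
        let b = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm_sllv_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 4, 8));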
5120    }
5121
5122    #[simd_test(enable = "avx2")]
5123    const fn test_mm256_sllv_epi32() {
5124        let a = _mm256_set1_epi32(2);
5125        let b = _mm256_set1_epi32(1);
5126        let r = _mm256_sllv_epi32(a, b);
5127        let e = _mm256_set1_epi32(4);
5128        assert_eq_m256i(r, e);
5129    }
5130
5131    #[simd_test(enable = "avx2")]
5132    const fn test_mm_sllv_epi64() {
5133        let a = _mm_set1_epi64x(2);
5134        let b = _mm_set1_epi64x(1);
5135        let r = _mm_sllv_epi64(a, b);
5136        let e = _mm_set1_epi64x(4);
5137        assert_eq_m128i(r, e);
5138    }
5139
5140    #[simd_test(enable = "avx2")]
5141    const fn test_mm256_sllv_epi64() {
5142        let a = _mm256_set1_epi64x(2);
5143        let b = _mm256_set1_epi64x(1);
5144        let r = _mm256_sllv_epi64(a, b);
5145        let e = _mm256_set1_epi64x(4);
5146        assert_eq_m256i(r, e);
5147    }
5148
5149    #[simd_test(enable = "avx2")]
5150    fn test_mm256_sra_epi16() {
5151        let a = _mm256_set1_epi16(-1);
5152        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5153        let r = _mm256_sra_epi16(a, b);
5154        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5155    }
5156
5157    #[simd_test(enable = "avx2")]
5158    fn test_mm256_sra_epi32() {
5159        let a = _mm256_set1_epi32(-1);
5160        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5161        let r = _mm256_sra_epi32(a, b);
5162        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5163    }
5164
5165    #[simd_test(enable = "avx2")]
5166    const fn test_mm256_srai_epi16() {
5167        assert_eq_m256i(
5168            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5169            _mm256_set1_epi16(-1),
5170        );
5171    }
5172
5173    #[simd_test(enable = "avx2")]
5174    const fn test_mm256_srai_epi32() {
5175        assert_eq_m256i(
5176            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5177            _mm256_set1_epi32(-1),
5178        );
5179    }
5180
5181    #[simd_test(enable = "avx2")]
5182    const fn test_mm_srav_epi32() {
5183        let a = _mm_set1_epi32(4);
5184        let count = _mm_set1_epi32(1);
5185        let r = _mm_srav_epi32(a, count);
5186        let e = _mm_set1_epi32(2);
5187        assert_eq_m128i(r, e);
5188    }
5189
5190    #[simd_test(enable = "avx2")]
5191    const fn test_mm256_srav_epi32() {
5192        let a = _mm256_set1_epi32(4);
5193        let count = _mm256_set1_epi32(1);
5194        let r = _mm256_srav_epi32(a, count);
5195        let e = _mm256_set1_epi32(2);
5196        assert_eq_m256i(r, e);
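        // Illustrative extra check: arithmetic shifts replicate the sign bit.
        let a = _mm256_set1_epi32(-4);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srav_epi32(a, count);
        assert_eq_m256i(r, _mm256_set1_epi32(-2));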
5197    }
5198
5199    #[simd_test(enable = "avx2")]
5200    const fn test_mm256_srli_si256() {
5201        #[rustfmt::skip]
5202        let a = _mm256_setr_epi8(
5203            1, 2, 3, 4, 5, 6, 7, 8,
5204            9, 10, 11, 12, 13, 14, 15, 16,
5205            17, 18, 19, 20, 21, 22, 23, 24,
5206            25, 26, 27, 28, 29, 30, 31, 32,
5207        );
5208        let r = _mm256_srli_si256::<3>(a);
5209        #[rustfmt::skip]
5210        let e = _mm256_setr_epi8(
5211            4, 5, 6, 7, 8, 9, 10, 11,
5212            12, 13, 14, 15, 16, 0, 0, 0,
5213            20, 21, 22, 23, 24, 25, 26, 27,
5214            28, 29, 30, 31, 32, 0, 0, 0,
5215        );
5216        assert_eq_m256i(r, e);
5217    }
5218
5219    #[simd_test(enable = "avx2")]
5220    fn test_mm256_srl_epi16() {
5221        let a = _mm256_set1_epi16(0xFF);
5222        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5223        let r = _mm256_srl_epi16(a, b);
5224        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5225    }
5226
5227    #[simd_test(enable = "avx2")]
5228    fn test_mm256_srl_epi32() {
5229        let a = _mm256_set1_epi32(0xFFFF);
5230        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5231        let r = _mm256_srl_epi32(a, b);
5232        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5233    }
5234
5235    #[simd_test(enable = "avx2")]
5236    fn test_mm256_srl_epi64() {
5237        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5238        let b = _mm_setr_epi64x(4, 0);
5239        let r = _mm256_srl_epi64(a, b);
5240        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5241    }
5242
5243    #[simd_test(enable = "avx2")]
5244    const fn test_mm256_srli_epi16() {
5245        assert_eq_m256i(
5246            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5247            _mm256_set1_epi16(0xF),
5248        );
5249    }
5250
5251    #[simd_test(enable = "avx2")]
5252    const fn test_mm256_srli_epi32() {
5253        assert_eq_m256i(
5254            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5255            _mm256_set1_epi32(0xFFF),
5256        );
5257    }
5258
5259    #[simd_test(enable = "avx2")]
5260    const fn test_mm256_srli_epi64() {
5261        assert_eq_m256i(
5262            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5263            _mm256_set1_epi64x(0xFFFFFFF),
5264        );
5265    }
5266
5267    #[simd_test(enable = "avx2")]
5268    const fn test_mm_srlv_epi32() {
5269        let a = _mm_set1_epi32(2);
5270        let count = _mm_set1_epi32(1);
5271        let r = _mm_srlv_epi32(a, count);
5272        let e = _mm_set1_epi32(1);
5273        assert_eq_m128i(r, e);
5274    }
5275
5276    #[simd_test(enable = "avx2")]
5277    const fn test_mm256_srlv_epi32() {
5278        let a = _mm256_set1_epi32(2);
5279        let count = _mm256_set1_epi32(1);
5280        let r = _mm256_srlv_epi32(a, count);
5281        let e = _mm256_set1_epi32(1);
5282        assert_eq_m256i(r, e);
5283    }
5284
5285    #[simd_test(enable = "avx2")]
5286    const fn test_mm_srlv_epi64() {
5287        let a = _mm_set1_epi64x(2);
5288        let count = _mm_set1_epi64x(1);
5289        let r = _mm_srlv_epi64(a, count);
5290        let e = _mm_set1_epi64x(1);
5291        assert_eq_m128i(r, e);
5292    }
5293
5294    #[simd_test(enable = "avx2")]
5295    const fn test_mm256_srlv_epi64() {
5296        let a = _mm256_set1_epi64x(2);
5297        let count = _mm256_set1_epi64x(1);
5298        let r = _mm256_srlv_epi64(a, count);
5299        let e = _mm256_set1_epi64x(1);
5300        assert_eq_m256i(r, e);
5301    }
5302
5303    #[simd_test(enable = "avx2")]
5304    unsafe fn test_mm256_stream_load_si256() {
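        // `a` is an __m256i, which is 32-byte aligned, so the pointer satisfies
        // the alignment vmovntdqa requires.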
5305        let a = _mm256_set_epi64x(5, 6, 7, 8);
5306        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5307        assert_eq_m256i(a, r);
5308    }
5309
5310    #[simd_test(enable = "avx2")]
5311    const fn test_mm256_sub_epi16() {
5312        let a = _mm256_set1_epi16(4);
5313        let b = _mm256_set1_epi16(2);
5314        let r = _mm256_sub_epi16(a, b);
5315        assert_eq_m256i(r, b);
5316    }
5317
5318    #[simd_test(enable = "avx2")]
5319    const fn test_mm256_sub_epi32() {
5320        let a = _mm256_set1_epi32(4);
5321        let b = _mm256_set1_epi32(2);
5322        let r = _mm256_sub_epi32(a, b);
5323        assert_eq_m256i(r, b);
5324    }
5325
5326    #[simd_test(enable = "avx2")]
5327    const fn test_mm256_sub_epi64() {
5328        let a = _mm256_set1_epi64x(4);
5329        let b = _mm256_set1_epi64x(2);
5330        let r = _mm256_sub_epi64(a, b);
5331        assert_eq_m256i(r, b);
5332    }
5333
5334    #[simd_test(enable = "avx2")]
5335    const fn test_mm256_sub_epi8() {
5336        let a = _mm256_set1_epi8(4);
5337        let b = _mm256_set1_epi8(2);
5338        let r = _mm256_sub_epi8(a, b);
5339        assert_eq_m256i(r, b);
5340    }
5341
5342    #[simd_test(enable = "avx2")]
5343    const fn test_mm256_subs_epi16() {
5344        let a = _mm256_set1_epi16(4);
5345        let b = _mm256_set1_epi16(2);
5346        let r = _mm256_subs_epi16(a, b);
5347        assert_eq_m256i(r, b);
5348    }
5349
5350    #[simd_test(enable = "avx2")]
5351    const fn test_mm256_subs_epi8() {
5352        let a = _mm256_set1_epi8(4);
5353        let b = _mm256_set1_epi8(2);
5354        let r = _mm256_subs_epi8(a, b);
5355        assert_eq_m256i(r, b);
5356    }
5357
5358    #[simd_test(enable = "avx2")]
5359    const fn test_mm256_subs_epu16() {
5360        let a = _mm256_set1_epi16(4);
5361        let b = _mm256_set1_epi16(2);
5362        let r = _mm256_subs_epu16(a, b);
5363        assert_eq_m256i(r, b);
5364    }
5365
5366    #[simd_test(enable = "avx2")]
5367    const fn test_mm256_subs_epu8() {
5368        let a = _mm256_set1_epi8(4);
5369        let b = _mm256_set1_epi8(2);
5370        let r = _mm256_subs_epu8(a, b);
5371        assert_eq_m256i(r, b);
5372    }
5373
5374    #[simd_test(enable = "avx2")]
5375    const fn test_mm256_xor_si256() {
5376        let a = _mm256_set1_epi8(5);
5377        let b = _mm256_set1_epi8(3);
5378        let r = _mm256_xor_si256(a, b);
5379        assert_eq_m256i(r, _mm256_set1_epi8(6));
5380    }
5381
5382    #[simd_test(enable = "avx2")]
5383    const fn test_mm256_alignr_epi8() {
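        // vpalignr concatenates the matching 128-bit halves of `a` (upper) and `b`
        // (lower) and shifts the 32-byte pair right by the immediate number of bytes;
        // a shift of 32 or more clears the result.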
5384        #[rustfmt::skip]
5385        let a = _mm256_setr_epi8(
5386            1, 2, 3, 4, 5, 6, 7, 8,
5387            9, 10, 11, 12, 13, 14, 15, 16,
5388            17, 18, 19, 20, 21, 22, 23, 24,
5389            25, 26, 27, 28, 29, 30, 31, 32,
5390        );
5391        #[rustfmt::skip]
5392        let b = _mm256_setr_epi8(
5393            -1, -2, -3, -4, -5, -6, -7, -8,
5394            -9, -10, -11, -12, -13, -14, -15, -16,
5395            -17, -18, -19, -20, -21, -22, -23, -24,
5396            -25, -26, -27, -28, -29, -30, -31, -32,
5397        );
5398        let r = _mm256_alignr_epi8::<33>(a, b);
5399        assert_eq_m256i(r, _mm256_set1_epi8(0));
5400
5401        let r = _mm256_alignr_epi8::<17>(a, b);
5402        #[rustfmt::skip]
5403        let expected = _mm256_setr_epi8(
5404            2, 3, 4, 5, 6, 7, 8, 9,
5405            10, 11, 12, 13, 14, 15, 16, 0,
5406            18, 19, 20, 21, 22, 23, 24, 25,
5407            26, 27, 28, 29, 30, 31, 32, 0,
5408        );
5409        assert_eq_m256i(r, expected);
5410
5411        let r = _mm256_alignr_epi8::<4>(a, b);
5412        #[rustfmt::skip]
5413        let expected = _mm256_setr_epi8(
5414            -5, -6, -7, -8, -9, -10, -11, -12,
5415            -13, -14, -15, -16, 1, 2, 3, 4,
5416            -21, -22, -23, -24, -25, -26, -27, -28,
5417            -29, -30, -31, -32, 17, 18, 19, 20,
5418        );
5419        assert_eq_m256i(r, expected);
5420
5421        let r = _mm256_alignr_epi8::<15>(a, b);
5422        #[rustfmt::skip]
5423        let expected = _mm256_setr_epi8(
5424            -16, 1, 2, 3, 4, 5, 6, 7,
5425            8, 9, 10, 11, 12, 13, 14, 15,
5426            -32, 17, 18, 19, 20, 21, 22, 23,
5427            24, 25, 26, 27, 28, 29, 30, 31,
5428        );
5429        assert_eq_m256i(r, expected);
5430
5431        let r = _mm256_alignr_epi8::<0>(a, b);
5432        assert_eq_m256i(r, b);
5433
5434        let r = _mm256_alignr_epi8::<16>(a, b);
5435        assert_eq_m256i(r, a);
5436    }
5437
5438    #[simd_test(enable = "avx2")]
5439    fn test_mm256_shuffle_epi8() {
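        // Control bytes with the high bit set (128u8 here) zero the output byte;
        // the rest index bytes within the same 128-bit half.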
5440        #[rustfmt::skip]
5441        let a = _mm256_setr_epi8(
5442            1, 2, 3, 4, 5, 6, 7, 8,
5443            9, 10, 11, 12, 13, 14, 15, 16,
5444            17, 18, 19, 20, 21, 22, 23, 24,
5445            25, 26, 27, 28, 29, 30, 31, 32,
5446        );
5447        #[rustfmt::skip]
5448        let b = _mm256_setr_epi8(
5449            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5450            12, 5, 5, 10, 4, 1, 8, 0,
5451            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5452            12, 5, 5, 10, 4, 1, 8, 0,
5453        );
5454        #[rustfmt::skip]
5455        let expected = _mm256_setr_epi8(
5456            5, 0, 5, 4, 9, 13, 7, 4,
5457            13, 6, 6, 11, 5, 2, 9, 1,
5458            21, 0, 21, 20, 25, 29, 23, 20,
5459            29, 22, 22, 27, 21, 18, 25, 17,
5460        );
5461        let r = _mm256_shuffle_epi8(a, b);
5462        assert_eq_m256i(r, expected);
5463    }
5464
5465    #[simd_test(enable = "avx2")]
5466    fn test_mm256_permutevar8x32_epi32() {
5467        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5468        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5469        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5470        let r = _mm256_permutevar8x32_epi32(a, b);
5471        assert_eq_m256i(r, expected);
5472    }
5473
5474    #[simd_test(enable = "avx2")]
5475    const fn test_mm256_permute4x64_epi64() {
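        // The immediate 0b00_01_00_11 selects source lanes (3, 0, 1, 0),
        // two bits per destination lane starting from the low end.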
5476        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5477        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5478        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5479        assert_eq_m256i(r, expected);
5480    }
5481
5482    #[simd_test(enable = "avx2")]
5483    const fn test_mm256_permute2x128_si256() {
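        // Immediate 0b00_01_00_11: the low nibble (3) puts the high half of `b` in the
        // low 128 bits, the high nibble (1) puts the high half of `a` in the high 128 bits.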
5484        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5485        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5486        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5487        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5488        assert_eq_m256i(r, e);
5489    }
5490
5491    #[simd_test(enable = "avx2")]
5492    const fn test_mm256_permute4x64_pd() {
5493        let a = _mm256_setr_pd(1., 2., 3., 4.);
5494        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5495        let e = _mm256_setr_pd(4., 1., 2., 1.);
5496        assert_eq_m256d(r, e);
5497    }
5498
5499    #[simd_test(enable = "avx2")]
5500    fn test_mm256_permutevar8x32_ps() {
5501        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5502        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5503        let r = _mm256_permutevar8x32_ps(a, b);
5504        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5505        assert_eq_m256(r, e);
5506    }
5507
5508    #[simd_test(enable = "avx2")]
5509    unsafe fn test_mm_i32gather_epi32() {
5510        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5511        // A scale of 4 steps over whole i32 (4-byte) elements
5512        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5513        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5514    }
5515
5516    #[simd_test(enable = "avx2")]
5517    unsafe fn test_mm_mask_i32gather_epi32() {
5518        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5519        // A scale of 4 steps over whole i32 (4-byte) elements
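        // Lanes whose mask sign bit is clear are not gathered and keep the value from `src` (256).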
5520        let r = _mm_mask_i32gather_epi32::<4>(
5521            _mm_set1_epi32(256),
5522            arr.as_ptr(),
5523            _mm_setr_epi32(0, 16, 64, 96),
5524            _mm_setr_epi32(-1, -1, -1, 0),
5525        );
5526        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5527    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes the indices count whole i32 elements (4 bytes each)
        let r =
            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes the indices count whole i32 elements (4 bytes each)
        let r = _mm256_mask_i32gather_epi32::<4>(
            _mm256_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm_mask_i32gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }
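
    // Illustrative sketch (test name ours, not upstream): for the floating-point gathers the
    // mask is likewise read as the sign bit of each element, so -0.0 enables a lane exactly
    // as -1.0 does and a positive value keeps the source element.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_ps_sign_bit_mask() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        let r = _mm_mask_i32gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm_setr_ps(-0.0, -0.0, -0.0, 1.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }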

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r =
            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm256_mask_i32gather_ps::<4>(
            _mm256_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
        );
        assert_eq_m256(
            r,
            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
        );
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each);
        // only the low two i32 indices feed the two-element result
        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm_mask_i32gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi32(16, 16, 16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm256_mask_i32gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm_mask_i32gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(16, 16, 16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm256_mask_i32gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes the indices count whole i32 elements (4 bytes each);
        // with 64-bit indices only two elements are gathered and the upper half is zeroed
        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes the indices count whole i32 elements (4 bytes each)
        let r = _mm_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_epi32(-1, 0, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes the indices count whole i32 elements (4 bytes each)
        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes the indices count whole i32 elements (4 bytes each)
        let r = _mm256_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_epi32(-1, -1, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes the indices count whole f32 elements (4 bytes each)
        let r = _mm256_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm_mask_i64gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes the indices count whole i64 elements (8 bytes each)
        let r = _mm256_mask_i64gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm_mask_i64gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A scale of 8 makes the indices count whole f64 elements (8 bytes each)
        let r = _mm256_mask_i64gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
        // The extracted byte is zero-extended into the returned i32, so -1 reads back as 0xFF.
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }
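
    // Illustrative sketch (test name ours, not upstream): because the extracted byte is
    // zero-extended into the returned i32, casting back to i8 recovers the signed value.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi8_sign() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        assert_eq!(_mm256_extract_epi8::<0>(a) as i8, -1);
    }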

    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
        // Likewise, the extracted 16-bit element is zero-extended, so -1 reads back as 0xFFFF.
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
}