Merge commit '3383cfbd35' into sync-from-portable-simd-2025-01-18

2025-01-18 15:37:14 -05:00 · 2025-01-18 15:37:14 -05:00 · 1ff6c555bb
commit 1ff6c555bb
parent 8321f00bf4 3383cfbd35
31 changed files with 865 additions and 388 deletions
--- a/library/portable-simd/crates/core_simd/Cargo.toml
+++ b/library/portable-simd/crates/core_simd/Cargo.toml
@ -9,10 +9,9 @@ categories = ["hardware-support", "no-std"]
 license = "MIT OR Apache-2.0"

 [features]
-default = ["as_crate"]
+default = ["as_crate", "std"]
 as_crate = []
 std = []
-all_lane_counts = []

 [target.'cfg(target_arch = "wasm32")'.dev-dependencies]
 wasm-bindgen = "0.2"
--- a/library/portable-simd/crates/core_simd/src/lane_count.rs
+++ b/library/portable-simd/crates/core_simd/src/lane_count.rs
@ -33,10 +33,8 @@ macro_rules! supported_lane_count {
    };
 }

-supported_lane_count!(1, 2, 4, 8, 16, 32, 64);
-#[cfg(feature = "all_lane_counts")]
 supported_lane_count!(
-    3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-    31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
-    56, 57, 58, 59, 60, 61, 62, 63
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
 );
--- a/library/portable-simd/crates/core_simd/src/lib.rs
+++ b/library/portable-simd/crates/core_simd/src/lib.rs
@ -1,7 +1,6 @@
 #![no_std]
 #![feature(
-    const_refs_to_cell,
-    const_mut_refs,
+    const_eval_select,
    convert_float_to_int,
    core_intrinsics,
    decl_macro,
@ -26,6 +25,7 @@
    all(target_arch = "arm", target_feature = "v7"),
    feature(stdarch_arm_neon_intrinsics)
 )]
+#![cfg_attr(target_arch = "loongarch64", feature(stdarch_loongarch))]
 #![cfg_attr(
    any(target_arch = "powerpc", target_arch = "powerpc64"),
    feature(stdarch_powerpc)
--- a/library/portable-simd/crates/core_simd/src/masks.rs
+++ b/library/portable-simd/crates/core_simd/src/masks.rs
@ -308,48 +308,6 @@ where
        Self(mask_impl::Mask::from_bitmask_integer(bitmask))
    }

-    /// Creates a bitmask vector from a mask.
-    ///
-    /// Each bit is set if the corresponding element in the mask is `true`.
-    /// The remaining bits are unset.
-    ///
-    /// The bits are packed into the first N bits of the vector:
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
-    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::mask32x8;
-    /// let mask = mask32x8::from_array([true, false, true, false, false, false, true, false]);
-    /// assert_eq!(mask.to_bitmask_vector()[0], 0b01000101);
-    /// ```
-    #[inline]
-    #[must_use = "method returns a new integer and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        self.0.to_bitmask_vector()
-    }
-
-    /// Creates a mask from a bitmask vector.
-    ///
-    /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
-    ///
-    /// The bits are packed into the first N bits of the vector:
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
-    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{mask32x8, u8x8};
-    /// let bitmask = u8x8::from_array([0b01000101, 0, 0, 0, 0, 0, 0, 0]);
-    /// assert_eq!(
-    ///     mask32x8::from_bitmask_vector(bitmask),
-    ///     mask32x8::from_array([true, false, true, false, false, false, true, false]),
-    /// );
-    /// ```
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        Self(mask_impl::Mask::from_bitmask_vector(bitmask))
-    }
-
    /// Finds the index of the first set element.
    ///
    /// ```
--- a/library/portable-simd/crates/core_simd/src/masks/bitmask.rs
+++ b/library/portable-simd/crates/core_simd/src/masks/bitmask.rs
@ -122,23 +122,6 @@ where
        unsafe { Self(core::intrinsics::simd::simd_bitmask(value), PhantomData) }
    }

-    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        let mut bitmask = Simd::splat(0);
-        bitmask.as_mut_array()[..self.0.as_ref().len()].copy_from_slice(self.0.as_ref());
-        bitmask
-    }
-
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
-        let len = bytes.as_ref().len();
-        bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
-        Self(bytes, PhantomData)
-    }
-
    #[inline]
    pub fn to_bitmask_integer(self) -> u64 {
        let mut bitmask = [0u8; 8];
--- a/library/portable-simd/crates/core_simd/src/masks/full_masks.rs
+++ b/library/portable-simd/crates/core_simd/src/masks/full_masks.rs
@ -140,62 +140,6 @@ where
        unsafe { Mask(core::intrinsics::simd::simd_cast(self.0)) }
    }

-    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        let mut bitmask = Simd::splat(0);
-
-        // Safety: Bytes is the right size array
-        unsafe {
-            // Compute the bitmask
-            let mut bytes: <LaneCount<N> as SupportedLaneCount>::BitMask =
-                core::intrinsics::simd::simd_bitmask(self.0);
-
-            // LLVM assumes bit order should match endianness
-            if cfg!(target_endian = "big") {
-                for x in bytes.as_mut() {
-                    *x = x.reverse_bits()
-                }
-                if N % 8 > 0 {
-                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
-                }
-            }
-
-            bitmask.as_mut_array()[..bytes.as_ref().len()].copy_from_slice(bytes.as_ref());
-        }
-
-        bitmask
-    }
-
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
-
-        // Safety: Bytes is the right size array
-        unsafe {
-            let len = bytes.as_ref().len();
-            bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
-
-            // LLVM assumes bit order should match endianness
-            if cfg!(target_endian = "big") {
-                for x in bytes.as_mut() {
-                    *x = x.reverse_bits();
-                }
-                if N % 8 > 0 {
-                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
-                }
-            }
-
-            // Compute the regular mask
-            Self::from_int_unchecked(core::intrinsics::simd::simd_select_bitmask(
-                bytes,
-                Self::splat(true).to_int(),
-                Self::splat(false).to_int(),
-            ))
-        }
-    }
-
    #[inline]
    unsafe fn to_bitmask_impl<U: ReverseBits, const M: usize>(self) -> U
    where
@ -283,7 +227,7 @@ where
    }

    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
    pub fn all(self) -> bool {
        // Safety: use `self` as an integer vector
        unsafe { core::intrinsics::simd::simd_reduce_all(self.to_int()) }
--- a/library/portable-simd/crates/core_simd/src/ops.rs
+++ b/library/portable-simd/crates/core_simd/src/ops.rs
@ -77,7 +77,7 @@ macro_rules! int_divrem_guard {
    (   $lhs:ident,
        $rhs:ident,
        {   const PANIC_ZERO: &'static str = $zero:literal;
-            $simd_call:ident
+            $simd_call:ident, $op:tt
        },
        $int:ident ) => {
        if $rhs.simd_eq(Simd::splat(0 as _)).any() {
@ -96,8 +96,23 @@ macro_rules! int_divrem_guard {
                // Nice base case to make it easy to const-fold away the other branch.
                $rhs
            };
-            // Safety: $lhs and rhs are vectors
-            unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
+
+            // aarch64 div fails for arbitrary `v % 0`, mod fails when rhs is MIN, for non-powers-of-two
+            // these operations aren't vectorized on aarch64 anyway
+            #[cfg(target_arch = "aarch64")]
+            {
+                let mut out = Simd::splat(0 as _);
+                for i in 0..Self::LEN {
+                    out[i] = $lhs[i] $op rhs[i];
+                }
+                out
+            }
+
+            #[cfg(not(target_arch = "aarch64"))]
+            {
+                // Safety: $lhs and rhs are vectors
+                unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
+            }
        }
    };
 }
@ -205,14 +220,14 @@ for_base_ops! {
    impl Div::div {
        int_divrem_guard {
            const PANIC_ZERO: &'static str = "attempt to divide by zero";
-            simd_div
+            simd_div, /
        }
    }

    impl Rem::rem {
        int_divrem_guard {
            const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
-            simd_rem
+            simd_rem, %
        }
    }

--- a/library/portable-simd/crates/core_simd/src/simd/cmp/eq.rs
+++ b/library/portable-simd/crates/core_simd/src/simd/cmp/eq.rs
@ -12,7 +12,7 @@ pub trait SimdPartialEq {
    #[must_use = "method returns a new mask and does not mutate the original value"]
    fn simd_eq(self, other: Self) -> Self::Mask;

-    /// Test if each element is equal to the corresponding element in `other`.
+    /// Test if each element is not equal to the corresponding element in `other`.
    #[must_use = "method returns a new mask and does not mutate the original value"]
    fn simd_ne(self, other: Self) -> Self::Mask;
 }
--- a/library/portable-simd/crates/core_simd/src/simd/num/float.rs
+++ b/library/portable-simd/crates/core_simd/src/simd/num/float.rs
@ -255,6 +255,7 @@ macro_rules! impl_trait {
            type Bits = Simd<$bits_ty, N>;
            type Cast<T: SimdElement> = Simd<T, N>;

+            #[cfg(not(target_arch = "aarch64"))]
            #[inline]
            fn cast<T: SimdCast>(self) -> Self::Cast<T>
            {
@ -262,6 +263,33 @@ macro_rules! impl_trait {
                unsafe { core::intrinsics::simd::simd_as(self) }
            }

+            // https://github.com/llvm/llvm-project/issues/94694
+            #[cfg(target_arch = "aarch64")]
+            #[inline]
+            fn cast<T: SimdCast>(self) -> Self::Cast<T>
+            {
+                const { assert!(N <= 64) };
+                if N <= 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64 {
+                    // Safety: supported types are guaranteed by SimdCast
+                    unsafe { core::intrinsics::simd::simd_as(self) }
+                } else if N < 4 {
+                    let x = self.resize::<4>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 8 {
+                    let x = self.resize::<8>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 16 {
+                    let x = self.resize::<16>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 32 {
+                    let x = self.resize::<32>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else {
+                    let x = self.resize::<64>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                }
+            }
+
            #[inline]
            #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
            unsafe fn to_int_unchecked<I: SimdCast>(self) -> Self::Cast<I>
@ -391,7 +419,7 @@ macro_rules! impl_trait {
                    self.as_array().iter().sum()
                } else {
                    // Safety: `self` is a float vector
-                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, 0.) }
+                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, -0.) }
                }
            }

--- a/library/portable-simd/crates/core_simd/src/simd/num/int.rs
+++ b/library/portable-simd/crates/core_simd/src/simd/num/int.rs
@ -1,6 +1,6 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
+    cmp::SimdOrd, cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
    SupportedLaneCount,
 };

@ -70,11 +70,27 @@ pub trait SimdInt: Copy + Sealed {
    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
    /// # use simd::prelude::*;
    /// use core::i32::{MIN, MAX};
-    /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]);
+    /// let xs = Simd::from_array([MIN, MIN + 1, -5, 0]);
    /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0]));
    /// ```
    fn abs(self) -> Self;

+    /// Lanewise absolute difference.
+    /// Every element becomes the absolute difference of `self` and `second`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::prelude::*;
+    /// use core::i32::{MIN, MAX};
+    /// let a = Simd::from_array([MIN, MAX, 100, -100]);
+    /// let b = Simd::from_array([MAX, MIN, -80, -120]);
+    /// assert_eq!(a.abs_diff(b), Simd::from_array([u32::MAX, u32::MAX, 180, 20]));
+    /// ```
+    fn abs_diff(self, second: Self) -> Self::Unsigned;
+
    /// Lanewise saturating absolute value, implemented in Rust.
    /// As abs(), except the MIN value becomes MAX instead of itself.
    ///
@ -203,6 +219,12 @@ pub trait SimdInt: Copy + Sealed {
    /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
    fn reverse_bits(self) -> Self;

+    /// Returns the number of ones in the binary representation of each element.
+    fn count_ones(self) -> Self::Unsigned;
+
+    /// Returns the number of zeros in the binary representation of each element.
+    fn count_zeros(self) -> Self::Unsigned;
+
    /// Returns the number of leading zeros in the binary representation of each element.
    fn leading_zeros(self) -> Self::Unsigned;

@ -259,6 +281,13 @@ macro_rules! impl_trait {
                (self^m) - m
            }

+            #[inline]
+            fn abs_diff(self, second: Self) -> Self::Unsigned {
+                let max = self.simd_max(second);
+                let min = self.simd_min(second);
+                (max - min).cast()
+            }
+
            #[inline]
            fn saturating_abs(self) -> Self {
                // arith shift for -1 or 0 mask based on sign bit, giving 2s complement
@ -344,6 +373,16 @@ macro_rules! impl_trait {
                unsafe { core::intrinsics::simd::simd_bitreverse(self) }
            }

+            #[inline]
+            fn count_ones(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().count_ones()
+            }
+
+            #[inline]
+            fn count_zeros(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().count_zeros()
+            }
+
            #[inline]
            fn leading_zeros(self) -> Self::Unsigned {
                self.cast::<$unsigned>().leading_zeros()
--- a/library/portable-simd/crates/core_simd/src/simd/num/uint.rs
+++ b/library/portable-simd/crates/core_simd/src/simd/num/uint.rs
@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
+use crate::simd::{cmp::SimdOrd, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};

 /// Operations on SIMD vectors of unsigned integers.
 pub trait SimdUint: Copy + Sealed {
@ -57,6 +57,22 @@ pub trait SimdUint: Copy + Sealed {
    /// assert_eq!(sat, Simd::splat(0));
    fn saturating_sub(self, second: Self) -> Self;

+    /// Lanewise absolute difference.
+    /// Every element becomes the absolute difference of `self` and `second`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::prelude::*;
+    /// use core::u32::MAX;
+    /// let a = Simd::from_array([0, MAX, 100, 20]);
+    /// let b = Simd::from_array([MAX, 0, 80, 200]);
+    /// assert_eq!(a.abs_diff(b), Simd::from_array([MAX, MAX, 20, 180]));
+    /// ```
+    fn abs_diff(self, second: Self) -> Self;
+
    /// Returns the sum of the elements of the vector, with wrapping addition.
    fn reduce_sum(self) -> Self::Scalar;

@ -85,6 +101,12 @@ pub trait SimdUint: Copy + Sealed {
    /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
    fn reverse_bits(self) -> Self;

+    /// Returns the number of ones in the binary representation of each element.
+    fn count_ones(self) -> Self;
+
+    /// Returns the number of zeros in the binary representation of each element.
+    fn count_zeros(self) -> Self;
+
    /// Returns the number of leading zeros in the binary representation of each element.
    fn leading_zeros(self) -> Self;

@ -138,6 +160,13 @@ macro_rules! impl_trait {
                unsafe { core::intrinsics::simd::simd_saturating_sub(self, second) }
            }

+            #[inline]
+            fn abs_diff(self, second: Self) -> Self {
+                let max = self.simd_max(second);
+                let min = self.simd_min(second);
+                max - min
+            }
+
            #[inline]
            fn reduce_sum(self) -> Self::Scalar {
                // Safety: `self` is an integer vector
@ -192,6 +221,17 @@ macro_rules! impl_trait {
                unsafe { core::intrinsics::simd::simd_bitreverse(self) }
            }

+            #[inline]
+            fn count_ones(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { core::intrinsics::simd::simd_ctpop(self) }
+            }
+
+            #[inline]
+            fn count_zeros(self) -> Self {
+                (!self).count_ones()
+            }
+
            #[inline]
            fn leading_zeros(self) -> Self {
                // Safety: `self` is an integer vector
--- a/library/portable-simd/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/library/portable-simd/crates/core_simd/src/simd/ptr/const_ptr.rs
@ -42,6 +42,19 @@ pub trait SimdConstPtr: Copy + Sealed {
    /// Equivalent to calling [`pointer::addr`] on each element.
    fn addr(self) -> Self::Usize;

+    /// Converts an address to a pointer without giving it any provenance.
+    ///
+    /// Without provenance, this pointer is not associated with any actual allocation. Such a
+    /// no-provenance pointer may be used for zero-sized memory accesses (if suitably aligned), but
+    /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
+    /// are little more than a usize address in disguise.
+    ///
+    /// This is different from [`Self::with_exposed_provenance`], which creates a pointer that picks up a
+    /// previously exposed provenance.
+    ///
+    /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
+    fn without_provenance(addr: Self::Usize) -> Self;
+
    /// Creates a new pointer with the given address.
    ///
    /// This performs the same operation as a cast, but copies the *address-space* and
@ -118,6 +131,14 @@ where
        unsafe { core::mem::transmute_copy(&self) }
    }

+    #[inline]
+    fn without_provenance(addr: Self::Usize) -> Self {
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Integer-to-pointer transmutes are valid (if you are okay with not getting any
+        // provenance).
+        unsafe { core::mem::transmute_copy(&addr) }
+    }
+
    #[inline]
    fn with_addr(self, addr: Self::Usize) -> Self {
        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
--- a/library/portable-simd/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/library/portable-simd/crates/core_simd/src/simd/ptr/mut_ptr.rs
@ -39,6 +39,19 @@ pub trait SimdMutPtr: Copy + Sealed {
    /// Equivalent to calling [`pointer::addr`] on each element.
    fn addr(self) -> Self::Usize;

+    /// Converts an address to a pointer without giving it any provenance.
+    ///
+    /// Without provenance, this pointer is not associated with any actual allocation. Such a
+    /// no-provenance pointer may be used for zero-sized memory accesses (if suitably aligned), but
+    /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
+    /// are little more than a usize address in disguise.
+    ///
+    /// This is different from [`Self::with_exposed_provenance`], which creates a pointer that picks up a
+    /// previously exposed provenance.
+    ///
+    /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
+    fn without_provenance(addr: Self::Usize) -> Self;
+
    /// Creates a new pointer with the given address.
    ///
    /// This performs the same operation as a cast, but copies the *address-space* and
@ -115,6 +128,14 @@ where
        unsafe { core::mem::transmute_copy(&self) }
    }

+    #[inline]
+    fn without_provenance(addr: Self::Usize) -> Self {
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Integer-to-pointer transmutes are valid (if you are okay with not getting any
+        // provenance).
+        unsafe { core::mem::transmute_copy(&addr) }
+    }
+
    #[inline]
    fn with_addr(self, addr: Self::Usize) -> Self {
        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
--- a/library/portable-simd/crates/core_simd/src/swizzle.rs
+++ b/library/portable-simd/crates/core_simd/src/swizzle.rs
@ -155,8 +155,7 @@ pub trait Swizzle<const N: usize> {

    /// Creates a new mask from the elements of `mask`.
    ///
-    /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
-    /// `first` and `second`.
+    /// Element `i` of the output is `mask[Self::INDEX[i]]`.
    #[inline]
    #[must_use = "method returns a new mask and does not mutate the original inputs"]
    fn swizzle_mask<T, const M: usize>(mask: Mask<T, M>) -> Mask<T, N>
@ -260,6 +259,50 @@ where
        Rotate::<OFFSET>::swizzle(self)
    }

+    /// Shifts the vector elements to the left by `OFFSET`, filling in with
+    /// `padding` from the right.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
+        struct Shift<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let mut index = [N; N];
+                let mut i = 0;
+                while i + OFFSET < N {
+                    index[i] = i + OFFSET;
+                    i += 1;
+                }
+                index
+            };
+        }
+
+        Shift::<OFFSET>::concat_swizzle(self, Simd::splat(padding))
+    }
+
+    /// Shifts the vector elements to the right by `OFFSET`, filling in with
+    /// `padding` from the left.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
+        struct Shift<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let mut index = [N; N];
+                let mut i = OFFSET;
+                while i < N {
+                    index[i] = i - OFFSET;
+                    i += 1;
+                }
+                index
+            };
+        }
+
+        Shift::<OFFSET>::concat_swizzle(self, Simd::splat(padding))
+    }
+
    /// Interleave two vectors.
    ///
    /// The resulting vectors contain elements taken alternatively from `self` and `other`, first
@ -320,7 +363,9 @@ where
    ///
    /// ```
    /// # #![feature(portable_simd)]
-    /// # use core::simd::Simd;
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::Simd;
    /// let a = Simd::from_array([0, 4, 1, 5]);
    /// let b = Simd::from_array([2, 6, 3, 7]);
    /// let (x, y) = a.deinterleave(b);
@ -391,4 +436,210 @@ where
        }
        Resize::<N>::concat_swizzle(self, Simd::splat(value))
    }
+
+    /// Extract a vector from another vector.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
+    /// let x = u32x4::from_array([0, 1, 2, 3]);
+    /// assert_eq!(x.extract::<1, 2>().to_array(), [1, 2]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn extract<const START: usize, const LEN: usize>(self) -> Simd<T, LEN>
+    where
+        LaneCount<LEN>: SupportedLaneCount,
+    {
+        struct Extract<const N: usize, const START: usize>;
+        impl<const N: usize, const START: usize, const LEN: usize> Swizzle<LEN> for Extract<N, START> {
+            const INDEX: [usize; LEN] = const {
+                assert!(START + LEN <= N, "index out of bounds");
+                let mut index = [0; LEN];
+                let mut i = 0;
+                while i < LEN {
+                    index[i] = START + i;
+                    i += 1;
+                }
+                index
+            };
+        }
+        Extract::<N, START>::swizzle(self)
+    }
+}
+
+impl<T, const N: usize> Mask<T, N>
+where
+    T: MaskElement,
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Reverse the order of the elements in the mask.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn reverse(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().reverse()) }
+    }
+
+    /// Rotates the mask such that the first `OFFSET` elements of the slice move to the end
+    /// while the last `self.len() - OFFSET` elements move to the front. After calling `rotate_elements_left`,
+    /// the element previously at index `OFFSET` will become the first element in the slice.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn rotate_elements_left<const OFFSET: usize>(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_left::<OFFSET>()) }
+    }
+
+    /// Rotates the mask such that the first `self.len() - OFFSET` elements of the mask move to
+    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_elements_right`,
+    /// the element previously at index `self.len() - OFFSET` will become the first element in the slice.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn rotate_elements_right<const OFFSET: usize>(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_right::<OFFSET>()) }
+    }
+
+    /// Shifts the mask elements to the left by `OFFSET`, filling in with
+    /// `padding` from the right.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: bool) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>(if padding {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Shifts the mask elements to the right by `OFFSET`, filling in with
+    /// `padding` from the left.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: bool) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>(if padding {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Interleave two masks.
+    ///
+    /// The resulting masks contain elements taken alternatively from `self` and `other`, first
+    /// filling the first result, and then the second.
+    ///
+    /// The reverse of this operation is [`Mask::deinterleave`].
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let a = mask32x4::from_array([false, true, false, true]);
+    /// let b = mask32x4::from_array([false, false, true, true]);
+    /// let (x, y) = a.interleave(b);
+    /// assert_eq!(x.to_array(), [false, false, true, false]);
+    /// assert_eq!(y.to_array(), [false, true, true, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn interleave(self, other: Self) -> (Self, Self) {
+        let (lo, hi) = self.to_int().interleave(other.to_int());
+        // Safety: swizzles are safe for masks
+        unsafe { (Self::from_int_unchecked(lo), Self::from_int_unchecked(hi)) }
+    }
+
+    /// Deinterleave two masks.
+    ///
+    /// The first result takes every other element of `self` and then `other`, starting with
+    /// the first element.
+    ///
+    /// The second result takes every other element of `self` and then `other`, starting with
+    /// the second element.
+    ///
+    /// The reverse of this operation is [`Mask::interleave`].
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let a = mask32x4::from_array([false, true, false, true]);
+    /// let b = mask32x4::from_array([false, false, true, true]);
+    /// let (x, y) = a.deinterleave(b);
+    /// assert_eq!(x.to_array(), [false, false, false, true]);
+    /// assert_eq!(y.to_array(), [true, true, false, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn deinterleave(self, other: Self) -> (Self, Self) {
+        let (even, odd) = self.to_int().deinterleave(other.to_int());
+        // Safety: swizzles are safe for masks
+        unsafe {
+            (
+                Self::from_int_unchecked(even),
+                Self::from_int_unchecked(odd),
+            )
+        }
+    }
+
+    /// Resize a mask.
+    ///
+    /// If `M` > `N`, extends the length of a mask, setting the new elements to `value`.
+    /// If `M` < `N`, truncates the mask to the first `M` elements.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let x = mask32x4::from_array([false, true, true, false]);
+    /// assert_eq!(x.resize::<8>(true).to_array(), [false, true, true, false, true, true, true, true]);
+    /// assert_eq!(x.resize::<2>(true).to_array(), [false, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn resize<const M: usize>(self, value: bool) -> Mask<T, M>
+    where
+        LaneCount<M>: SupportedLaneCount,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Mask::<T, M>::from_int_unchecked(self.to_int().resize::<M>(if value {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Extract a vector from another vector.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let x = mask32x4::from_array([false, true, true, false]);
+    /// assert_eq!(x.extract::<1, 2>().to_array(), [true, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn extract<const START: usize, const LEN: usize>(self) -> Mask<T, LEN>
+    where
+        LaneCount<LEN>: SupportedLaneCount,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe { Mask::<T, LEN>::from_int_unchecked(self.to_int().extract::<START, LEN>()) }
+    }
 }
--- a/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs
+++ b/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs
@ -59,15 +59,40 @@ where
                    target_endian = "little"
                ))]
                16 => transize(vqtbl1q_u8, self, idxs),
+                #[cfg(all(
+                    target_arch = "arm",
+                    target_feature = "v7",
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
+                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
-                32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
-                // Notable absence: avx512bw shuffle
-                // If avx512bw is available, odds of avx512vbmi are good
-                // FIXME: initial AVX512VBMI variant didn't actually pass muster
-                // #[cfg(target_feature = "avx512vbmi")]
-                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
+                32 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 32>::splat(N as u8).into(),
+                        );
+                        x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
+                // Notable absence: avx512bw pshufb shuffle
+                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
+                64 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 64>::splat(N as u8).into(),
+                        );
+                        x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
@ -82,6 +107,28 @@ where
    }
 }

+/// armv7 neon supports swizzling `u8x16` by swizzling two u8x8 blocks
+/// with a u8x8x2 lookup table.
+///
+/// # Safety
+/// This requires armv7 neon to work
+#[cfg(all(
+    target_arch = "arm",
+    target_feature = "v7",
+    target_feature = "neon",
+    target_endian = "little"
+))]
+unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
+    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
+    // SAFETY: Caller promised arm neon support
+    unsafe {
+        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
+        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
+        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
+        vcombine_u8(lo, hi).into()
+    }
+}
+
 /// "vpshufb like it was meant to be" on AVX2
 ///
 /// # Safety
--- a/library/portable-simd/crates/core_simd/src/vector.rs
+++ b/library/portable-simd/crates/core_simd/src/vector.rs
@ -99,7 +99,7 @@ use crate::simd::{
 // directly constructing an instance of the type (i.e. `let vector = Simd(array)`) should be
 // avoided, as it will likely become illegal on `#[repr(simd)]` structs in the future. It also
 // causes rustc to emit illegal LLVM IR in some cases.
-#[repr(simd)]
+#[repr(simd, packed)]
 pub struct Simd<T, const N: usize>([T; N])
 where
    LaneCount<N>: SupportedLaneCount,
@ -144,14 +144,32 @@ where
    /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
    /// ```
    #[inline]
-    pub fn splat(value: T) -> Self {
-        // This is preferred over `[value; N]`, since it's explicitly a splat:
-        // https://github.com/rust-lang/rust/issues/97804
-        struct Splat;
-        impl<const N: usize> Swizzle<N> for Splat {
-            const INDEX: [usize; N] = [0; N];
+    #[rustc_const_unstable(feature = "portable_simd", issue = "86656")]
+    pub const fn splat(value: T) -> Self {
+        const fn splat_const<T, const N: usize>(value: T) -> Simd<T, N>
+        where
+            T: SimdElement,
+            LaneCount<N>: SupportedLaneCount,
+        {
+            Simd::from_array([value; N])
        }
-        Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
+
+        fn splat_rt<T, const N: usize>(value: T) -> Simd<T, N>
+        where
+            T: SimdElement,
+            LaneCount<N>: SupportedLaneCount,
+        {
+            // This is preferred over `[value; N]`, since it's explicitly a splat:
+            // https://github.com/rust-lang/rust/issues/97804
+            struct Splat;
+            impl<const N: usize> Swizzle<N> for Splat {
+                const INDEX: [usize; N] = [0; N];
+            }
+
+            Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
+        }
+
+        core::intrinsics::const_eval_select((value,), splat_const, splat_rt)
    }

    /// Returns an array reference containing the entire SIMD vector.
@ -425,6 +443,9 @@ where
    ///
    /// When the element is disabled, that memory location is not accessed and the corresponding
    /// value from `or` is passed through.
+    ///
+    /// # Safety
+    /// Enabled loads must not exceed the length of `slice`.
    #[must_use]
    #[inline]
    pub unsafe fn load_select_unchecked(
@ -442,6 +463,9 @@ where
    ///
    /// When the element is disabled, that memory location is not accessed and the corresponding
    /// value from `or` is passed through.
+    ///
+    /// # Safety
+    /// Enabled `ptr` elements must be safe to read as if by `std::ptr::read`.
    #[must_use]
    #[inline]
    pub unsafe fn load_select_ptr(
@ -924,6 +948,7 @@ where
    }
 }

+/// Lexicographic order. For the SIMD elementwise minimum and maximum, use simd_min and simd_max instead.
 impl<T, const N: usize> PartialOrd for Simd<T, N>
 where
    LaneCount<N>: SupportedLaneCount,
@ -943,6 +968,7 @@ where
 {
 }

+/// Lexicographic order. For the SIMD elementwise minimum and maximum, use simd_min and simd_max instead.
 impl<T, const N: usize> Ord for Simd<T, N>
 where
    LaneCount<N>: SupportedLaneCount,
@ -1195,6 +1221,7 @@ fn lane_indices<const N: usize>() -> Simd<usize, N>
 where
    LaneCount<N>: SupportedLaneCount,
 {
+    #![allow(clippy::needless_range_loop)]
    let mut index = [0; N];
    for i in 0..N {
        index[i] = i;
--- a/library/portable-simd/crates/core_simd/src/vendor.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor.rs
@ -29,3 +29,6 @@ mod arm;

 #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
 mod powerpc;
+
+#[cfg(target_arch = "loongarch64")]
+mod loongarch64;
--- a/library/portable-simd/crates/core_simd/src/vendor/loongarch64.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor/loongarch64.rs
@ -0,0 +1,31 @@
+use crate::simd::*;
+use core::arch::loongarch64::*;
+
+from_transmute! { unsafe u8x16 => v16u8 }
+from_transmute! { unsafe u8x32 => v32u8 }
+from_transmute! { unsafe i8x16 => v16i8 }
+from_transmute! { unsafe i8x32 => v32i8 }
+
+from_transmute! { unsafe u16x8 => v8u16 }
+from_transmute! { unsafe u16x16 => v16u16 }
+from_transmute! { unsafe i16x8 => v8i16 }
+from_transmute! { unsafe i16x16 => v16i16 }
+
+from_transmute! { unsafe u32x4 => v4u32 }
+from_transmute! { unsafe u32x8 => v8u32 }
+from_transmute! { unsafe i32x4 => v4i32 }
+from_transmute! { unsafe i32x8 => v8i32 }
+from_transmute! { unsafe f32x4 => v4f32 }
+from_transmute! { unsafe f32x8 => v8f32 }
+
+from_transmute! { unsafe u64x2 => v2u64 }
+from_transmute! { unsafe u64x4 => v4u64 }
+from_transmute! { unsafe i64x2 => v2i64 }
+from_transmute! { unsafe i64x4 => v4i64 }
+from_transmute! { unsafe f64x2 => v2f64 }
+from_transmute! { unsafe f64x4 => v4f64 }
+
+from_transmute! { unsafe usizex2 => v2u64 }
+from_transmute! { unsafe usizex4 => v4u64 }
+from_transmute! { unsafe isizex2 => v2i64 }
+from_transmute! { unsafe isizex4 => v4i64 }
--- a/library/portable-simd/crates/core_simd/tests/layout.rs
+++ b/library/portable-simd/crates/core_simd/tests/layout.rs
@ -0,0 +1,35 @@
+#![feature(portable_simd)]
+
+macro_rules! layout_tests {
+    { $($mod:ident, $ty:ty,)* } => {
+        $(
+        mod $mod {
+            test_helpers::test_lanes! {
+                fn no_padding<const LANES: usize>() {
+                    assert_eq!(
+                        core::mem::size_of::<core_simd::simd::Simd::<$ty, LANES>>(),
+                        core::mem::size_of::<[$ty; LANES]>(),
+                    );
+                }
+            }
+        }
+        )*
+    }
+}
+
+layout_tests! {
+    i8, i8,
+    i16, i16,
+    i32, i32,
+    i64, i64,
+    isize, isize,
+    u8, u8,
+    u16, u16,
+    u32, u32,
+    u64, u64,
+    usize, usize,
+    f32, f32,
+    f64, f64,
+    mut_ptr, *mut (),
+    const_ptr, *const (),
+}
--- a/library/portable-simd/crates/core_simd/tests/masks.rs
+++ b/library/portable-simd/crates/core_simd/tests/masks.rs
@ -99,7 +99,6 @@ macro_rules! test_mask_api {
                assert_eq!(Mask::<$type, 2>::from_bitmask(bitmask), mask);
            }

-            #[cfg(feature = "all_lane_counts")]
            #[test]
            fn roundtrip_bitmask_conversion_odd() {
                let values = [
@ -134,48 +133,6 @@ macro_rules! test_mask_api {
                cast_impl::<i64>();
                cast_impl::<isize>();
            }
-
-            #[test]
-            fn roundtrip_bitmask_vector_conversion() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, false, true, false, false, true, false,
-                    true, true, false, false, false, false, false, true,
-                ];
-                let mask = Mask::<$type, 16>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b01001001, 0b10000011]);
-                assert_eq!(Mask::<$type, 16>::from_bitmask_vector(bitmask), mask);
-            }
-
-            // rust-lang/portable-simd#379
-            #[test]
-            fn roundtrip_bitmask_vector_conversion_small() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, true, true
-                ];
-                let mask = Mask::<$type, 4>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<1>(0).to_ne_bytes()[0], 0b00001101);
-                assert_eq!(Mask::<$type, 4>::from_bitmask_vector(bitmask), mask);
-            }
-
-            /* FIXME doesn't work with non-powers-of-two, yet
-            // rust-lang/portable-simd#379
-            #[cfg(feature = "all_lane_counts")]
-            #[test]
-            fn roundtrip_bitmask_vector_conversion_odd() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, true, false, true, true, false, false, false, true, true,
-                ];
-                let mask = Mask::<$type, 11>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b00110101, 0b00000110]);
-                assert_eq!(Mask::<$type, 11>::from_bitmask_vector(bitmask), mask);
-            }
-            */
        }
    }
 }
--- a/library/portable-simd/crates/core_simd/tests/ops_macros.rs
+++ b/library/portable-simd/crates/core_simd/tests/ops_macros.rs
@ -216,6 +216,22 @@ macro_rules! impl_common_integer_tests {
                )
            }

+            fn count_ones<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::count_ones,
+                    &|x| x.count_ones() as _,
+                    &|_| true,
+                )
+            }
+
+            fn count_zeros<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::count_zeros,
+                    &|x| x.count_zeros() as _,
+                    &|_| true,
+                )
+            }
+
            fn leading_zeros<const LANES: usize>() {
                test_helpers::test_unary_elementwise(
                    &$vector::<LANES>::leading_zeros,
@ -307,6 +323,14 @@ macro_rules! impl_signed_tests {
                    assert_eq!(a % b, Vector::<LANES>::splat(0));
                }

+                fn abs_diff<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::abs_diff,
+                        &Scalar::abs_diff,
+                        &|_, _| true,
+                    )
+                }
+
                fn simd_min<const LANES: usize>() {
                    use core_simd::simd::cmp::SimdOrd;
                    let a = Vector::<LANES>::splat(Scalar::MIN);
@ -419,6 +443,14 @@ macro_rules! impl_unsigned_tests {
                        &|_| true,
                    );
                }
+
+                fn abs_diff<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::abs_diff,
+                        &Scalar::abs_diff,
+                        &|_, _| true,
+                    )
+                }
            }

            impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign, Scalar::wrapping_add);
@ -495,6 +527,9 @@ macro_rules! impl_float_tests {
                }

                fn is_normal<const LANES: usize>() {
+                    // Arm v7 Neon violates float opsem re: subnormals, see
+                    // https://github.com/rust-lang/portable-simd/issues/439
+                    #[cfg(not(target_arch = "arm"))]
                    test_helpers::test_unary_mask_elementwise(
                        &Vector::<LANES>::is_normal,
                        &Scalar::is_normal,
@ -503,6 +538,9 @@ macro_rules! impl_float_tests {
                }

                fn is_subnormal<const LANES: usize>() {
+                    // Arm v7 Neon violates float opsem re: subnormals, see
+                    // https://github.com/rust-lang/portable-simd/issues/439
+                    #[cfg(not(target_arch = "arm"))]
                    test_helpers::test_unary_mask_elementwise(
                        &Vector::<LANES>::is_subnormal,
                        &Scalar::is_subnormal,
--- a/library/portable-simd/crates/core_simd/tests/swizzle.rs
+++ b/library/portable-simd/crates/core_simd/tests/swizzle.rs
@ -48,6 +48,24 @@ fn rotate() {
    assert_eq!(a.rotate_elements_right::<5>().to_array(), [4, 1, 2, 3]);
 }

+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn shift() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<0>(0).to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<1>(0).to_array(), [2, 3, 4, 0]);
+    assert_eq!(a.shift_elements_left::<2>(9).to_array(), [3, 4, 9, 9]);
+    assert_eq!(a.shift_elements_left::<3>(8).to_array(), [4, 8, 8, 8]);
+    assert_eq!(a.shift_elements_left::<4>(7).to_array(), [7, 7, 7, 7]);
+    assert_eq!(a.shift_elements_left::<5>(6).to_array(), [6, 6, 6, 6]);
+    assert_eq!(a.shift_elements_right::<0>(0).to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_right::<1>(0).to_array(), [0, 1, 2, 3]);
+    assert_eq!(a.shift_elements_right::<2>(-1).to_array(), [-1, -1, 1, 2]);
+    assert_eq!(a.shift_elements_right::<3>(-2).to_array(), [-2, -2, -2, 1]);
+    assert_eq!(a.shift_elements_right::<4>(-3).to_array(), [-3, -3, -3, -3]);
+    assert_eq!(a.shift_elements_right::<5>(-4).to_array(), [-4, -4, -4, -4]);
+}
+
 #[test]
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
 fn interleave() {
--- a/library/portable-simd/crates/test_helpers/Cargo.toml
+++ b/library/portable-simd/crates/test_helpers/Cargo.toml
@ -6,6 +6,3 @@ publish = false

 [dependencies]
 proptest = { version = "0.10", default-features = false, features = ["alloc"] }
-
-[features]
-all_lane_counts = []
--- a/library/portable-simd/crates/test_helpers/src/lib.rs
+++ b/library/portable-simd/crates/test_helpers/src/lib.rs
@ -539,32 +539,22 @@ macro_rules! test_lanes {
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                    lanes_1 1;
                    lanes_2 2;
-                    lanes_4 4;
-                );
-
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                $crate::test_lanes_helper!(
-                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
-                    lanes_8 8;
-                    lanes_16 16;
-                    lanes_32 32;
-                    lanes_64 64;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                $crate::test_lanes_helper!(
-                    // test some odd and even non-power-of-2 lengths on miri
-                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    // Cover an odd and an even non-power-of-2 length in Miri.
+                    // (Even non-power-of-2 vectors have alignment between element
+                    // and vector size, so we want to cover that case as well.)
                    lanes_3 3;
-                    lanes_5 5;
+
                    lanes_6 6;
                );

-                #[cfg(feature = "all_lane_counts")]
                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                $crate::test_lanes_helper!(
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_4 4;
+                    lanes_5 5;
+
                    lanes_7 7;
+                    lanes_8 8;
                    lanes_9 9;
                    lanes_10 10;
                    lanes_11 11;
@ -572,52 +562,55 @@ macro_rules! test_lanes {
                    lanes_13 13;
                    lanes_14 14;
                    lanes_15 15;
+                    lanes_16 16;
                    lanes_17 17;
-                    lanes_18 18;
-                    lanes_19 19;
-                    lanes_20 20;
-                    lanes_21 21;
-                    lanes_22 22;
-                    lanes_23 23;
+                    //lanes_18 18;
+                    //lanes_19 19;
+                    //lanes_20 20;
+                    //lanes_21 21;
+                    //lanes_22 22;
+                    //lanes_23 23;
                    lanes_24 24;
-                    lanes_25 25;
-                    lanes_26 26;
-                    lanes_27 27;
-                    lanes_28 28;
-                    lanes_29 29;
-                    lanes_30 30;
-                    lanes_31 31;
-                    lanes_33 33;
-                    lanes_34 34;
-                    lanes_35 35;
-                    lanes_36 36;
-                    lanes_37 37;
-                    lanes_38 38;
-                    lanes_39 39;
-                    lanes_40 40;
-                    lanes_41 41;
-                    lanes_42 42;
-                    lanes_43 43;
-                    lanes_44 44;
-                    lanes_45 45;
-                    lanes_46 46;
+                    //lanes_25 25;
+                    //lanes_26 26;
+                    //lanes_27 27;
+                    //lanes_28 28;
+                    //lanes_29 29;
+                    //lanes_30 30;
+                    //lanes_31 31;
+                    lanes_32 32;
+                    //lanes_33 33;
+                    //lanes_34 34;
+                    //lanes_35 35;
+                    //lanes_36 36;
+                    //lanes_37 37;
+                    //lanes_38 38;
+                    //lanes_39 39;
+                    //lanes_40 40;
+                    //lanes_41 41;
+                    //lanes_42 42;
+                    //lanes_43 43;
+                    //lanes_44 44;
+                    //lanes_45 45;
+                    //lanes_46 46;
                    lanes_47 47;
-                    lanes_48 48;
-                    lanes_49 49;
-                    lanes_50 50;
-                    lanes_51 51;
-                    lanes_52 52;
-                    lanes_53 53;
-                    lanes_54 54;
-                    lanes_55 55;
+                    //lanes_48 48;
+                    //lanes_49 49;
+                    //lanes_50 50;
+                    //lanes_51 51;
+                    //lanes_52 52;
+                    //lanes_53 53;
+                    //lanes_54 54;
+                    //lanes_55 55;
                    lanes_56 56;
                    lanes_57 57;
-                    lanes_58 58;
-                    lanes_59 59;
-                    lanes_60 60;
-                    lanes_61 61;
-                    lanes_62 62;
+                    //lanes_58 58;
+                    //lanes_59 59;
+                    //lanes_60 60;
+                    //lanes_61 61;
+                    //lanes_62 62;
                    lanes_63 63;
+                    lanes_64 64;
                );
            }
        )*
@ -639,36 +632,24 @@ macro_rules! test_lanes_panic {
                    core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount,
                $body

+                // test some odd and even non-power-of-2 lengths on miri
                $crate::test_lanes_helper!(
                    #[should_panic];
                    lanes_1 1;
                    lanes_2 2;
-                    lanes_4 4;
-                );
-
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                $crate::test_lanes_helper!(
-                    #[should_panic];
-                    lanes_8 8;
-                    lanes_16 16;
-                    lanes_32 32;
-                    lanes_64 64;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                $crate::test_lanes_helper!(
-                    // test some odd and even non-power-of-2 lengths on miri
-                    #[should_panic];
                    lanes_3 3;
-                    lanes_5 5;
+
                    lanes_6 6;
                );

-                #[cfg(feature = "all_lane_counts")]
                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                $crate::test_lanes_helper!(
                    #[should_panic];
+                    lanes_4 4;
+                    lanes_5 5;
+
                    lanes_7 7;
+                    lanes_8 8;
                    lanes_9 9;
                    lanes_10 10;
                    lanes_11 11;
@ -676,52 +657,55 @@ macro_rules! test_lanes_panic {
                    lanes_13 13;
                    lanes_14 14;
                    lanes_15 15;
+                    lanes_16 16;
                    lanes_17 17;
-                    lanes_18 18;
-                    lanes_19 19;
-                    lanes_20 20;
-                    lanes_21 21;
-                    lanes_22 22;
-                    lanes_23 23;
+                    //lanes_18 18;
+                    //lanes_19 19;
+                    //lanes_20 20;
+                    //lanes_21 21;
+                    //lanes_22 22;
+                    //lanes_23 23;
                    lanes_24 24;
-                    lanes_25 25;
-                    lanes_26 26;
-                    lanes_27 27;
-                    lanes_28 28;
-                    lanes_29 29;
-                    lanes_30 30;
-                    lanes_31 31;
-                    lanes_33 33;
-                    lanes_34 34;
-                    lanes_35 35;
-                    lanes_36 36;
-                    lanes_37 37;
-                    lanes_38 38;
-                    lanes_39 39;
-                    lanes_40 40;
-                    lanes_41 41;
-                    lanes_42 42;
-                    lanes_43 43;
-                    lanes_44 44;
-                    lanes_45 45;
-                    lanes_46 46;
+                    //lanes_25 25;
+                    //lanes_26 26;
+                    //lanes_27 27;
+                    //lanes_28 28;
+                    //lanes_29 29;
+                    //lanes_30 30;
+                    //lanes_31 31;
+                    lanes_32 32;
+                    //lanes_33 33;
+                    //lanes_34 34;
+                    //lanes_35 35;
+                    //lanes_36 36;
+                    //lanes_37 37;
+                    //lanes_38 38;
+                    //lanes_39 39;
+                    //lanes_40 40;
+                    //lanes_41 41;
+                    //lanes_42 42;
+                    //lanes_43 43;
+                    //lanes_44 44;
+                    //lanes_45 45;
+                    //lanes_46 46;
                    lanes_47 47;
-                    lanes_48 48;
-                    lanes_49 49;
-                    lanes_50 50;
-                    lanes_51 51;
-                    lanes_52 52;
-                    lanes_53 53;
-                    lanes_54 54;
-                    lanes_55 55;
+                    //lanes_48 48;
+                    //lanes_49 49;
+                    //lanes_50 50;
+                    //lanes_51 51;
+                    //lanes_52 52;
+                    //lanes_53 53;
+                    //lanes_54 54;
+                    //lanes_55 55;
                    lanes_56 56;
                    lanes_57 57;
-                    lanes_58 58;
-                    lanes_59 59;
-                    lanes_60 60;
-                    lanes_61 61;
-                    lanes_62 62;
+                    //lanes_58 58;
+                    //lanes_59 59;
+                    //lanes_60 60;
+                    //lanes_61 61;
+                    //lanes_62 62;
                    lanes_63 63;
+                    lanes_64 64;
                );
            }
        )*