From af2bc4e3efd94e2a0e34f0f6de0cf36c37e239ab Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Thu, 2 Dec 2021 18:27:47 -0800 Subject: [PATCH 01/35] Merge commit 'a8385522ade6f67853edac730b5bf164ddb298fd' into simd-remove-autosplats --- CONTRIBUTING.md | 2 +- README.md | 2 +- crates/core_simd/examples/nbody.rs | 10 +- crates/core_simd/src/comparisons.rs | 6 + crates/core_simd/src/lane_count.rs | 12 +- crates/core_simd/src/masks.rs | 24 ++ crates/core_simd/src/masks/bitmask.rs | 33 +- crates/core_simd/src/masks/full_masks.rs | 34 +- crates/core_simd/src/math.rs | 8 +- crates/core_simd/src/ops.rs | 419 +---------------------- crates/core_simd/src/ops/assign.rs | 124 +++++++ crates/core_simd/src/ops/deref.rs | 124 +++++++ crates/core_simd/src/ops/unary.rs | 77 +++++ crates/core_simd/src/reduction.rs | 66 ++-- crates/core_simd/src/select.rs | 3 + crates/core_simd/src/swizzle.rs | 9 + crates/core_simd/src/vector/float.rs | 20 ++ crates/core_simd/src/vector/ptr.rs | 4 +- crates/core_simd/src/vendor/x86.rs | 4 +- crates/core_simd/tests/autoderef.rs | 22 ++ crates/core_simd/tests/ops_macros.rs | 48 --- crates/test_helpers/src/lib.rs | 12 + 22 files changed, 539 insertions(+), 524 deletions(-) create mode 100644 crates/core_simd/src/ops/assign.rs create mode 100644 crates/core_simd/src/ops/deref.rs create mode 100644 crates/core_simd/src/ops/unary.rs create mode 100644 crates/core_simd/tests/autoderef.rs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9ba12d3a1b..9612fe871c6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,7 +15,7 @@ SIMD can be quite complex, and even a "simple" issue can be huge. If an issue is ## CI -We currently have 2 CI matrices through Travis CI and GitHub Actions that will automatically build and test your change in order to verify that `std::simd`'s portable API is, in fact, portable. If your change builds locally, but does not build on either, this is likely due to a platform-specific concern that your code has not addressed. 
Please consult the build logs and address the error, or ask for help if you need it. +We currently use GitHub Actions which will automatically build and test your change in order to verify that `std::simd`'s portable API is, in fact, portable. If your change builds locally, but does not build in CI, this is likely due to a platform-specific concern that your code has not addressed. Please consult the build logs and address the error, or ask for help if you need it. ## Beyond stdsimd diff --git a/README.md b/README.md index da536a4d6f2..db0af2da606 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # The Rust standard library's portable SIMD API -[![Build Status](https://travis-ci.com/rust-lang/portable-simd.svg?branch=master)](https://travis-ci.com/rust-lang/portable-simd) +![Build Status](https://github.com/rust-lang/portable-simd/actions/workflows/ci.yml/badge.svg?branch=master) Code repository for the [Portable SIMD Project Group](https://github.com/rust-lang/project-portable-simd). Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) for our contributing guidelines. 
diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs index 779575985ed..43280feebbd 100644 --- a/crates/core_simd/examples/nbody.rs +++ b/crates/core_simd/examples/nbody.rs @@ -97,7 +97,7 @@ mod nbody { let sun = &mut sun[0]; for body in rest { let m_ratio = body.mass / SOLAR_MASS; - sun.v -= body.v * m_ratio; + sun.v -= body.v * Simd::splat(m_ratio); } } @@ -143,14 +143,14 @@ mod nbody { let mut i = 0; for j in 0..N_BODIES { for k in j + 1..N_BODIES { - let f = r[i] * mag[i]; - bodies[j].v -= f * bodies[k].mass; - bodies[k].v += f * bodies[j].mass; + let f = r[i] * Simd::splat(mag[i]); + bodies[j].v -= f * Simd::splat(bodies[k].mass); + bodies[k].v += f * Simd::splat(bodies[j].mass); i += 1 } } for body in bodies { - body.x += dt * body.v + body.x += Simd::splat(dt) * body.v } } diff --git a/crates/core_simd/src/comparisons.rs b/crates/core_simd/src/comparisons.rs index 8c51baca8ed..edef5af3687 100644 --- a/crates/core_simd/src/comparisons.rs +++ b/crates/core_simd/src/comparisons.rs @@ -8,12 +8,14 @@ where { /// Test if each lane is equal to the corresponding lane in `other`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_eq(self, other: Self) -> Mask { unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) } } /// Test if each lane is not equal to the corresponding lane in `other`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_ne(self, other: Self) -> Mask { unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) } } @@ -26,24 +28,28 @@ where { /// Test if each lane is less than the corresponding lane in `other`. 
#[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_lt(self, other: Self) -> Mask { unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) } } /// Test if each lane is greater than the corresponding lane in `other`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_gt(self, other: Self) -> Mask { unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) } } /// Test if each lane is less than or equal to the corresponding lane in `other`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_le(self, other: Self) -> Mask { unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) } } /// Test if each lane is greater than or equal to the corresponding lane in `other`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_ge(self, other: Self) -> Mask { unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) } } diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs index b017e7d137e..3b316f12b3e 100644 --- a/crates/core_simd/src/lane_count.rs +++ b/crates/core_simd/src/lane_count.rs @@ -15,34 +15,28 @@ impl LaneCount { pub trait SupportedLaneCount: Sealed { #[doc(hidden)] type BitMask: Copy + Default + AsRef<[u8]> + AsMut<[u8]>; - - #[doc(hidden)] - type IntBitMask; } impl Sealed for LaneCount {} impl SupportedLaneCount for LaneCount<1> { type BitMask = [u8; 1]; - type IntBitMask = u8; } impl SupportedLaneCount for LaneCount<2> { type BitMask = [u8; 1]; - type IntBitMask = u8; } impl SupportedLaneCount for LaneCount<4> { type BitMask = [u8; 1]; - type IntBitMask = u8; } impl SupportedLaneCount for LaneCount<8> { type BitMask = [u8; 1]; - type IntBitMask = u8; } impl SupportedLaneCount for LaneCount<16> { type BitMask = [u8; 2]; - type IntBitMask = u16; } impl SupportedLaneCount for 
LaneCount<32> { type BitMask = [u8; 4]; - type IntBitMask = u32; +} +impl SupportedLaneCount for LaneCount<64> { + type BitMask = [u8; 8]; } diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index d460da0d04f..191e9690313 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -129,6 +129,7 @@ where /// # Safety /// All lanes must be either 0 or -1. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub unsafe fn from_int_unchecked(value: Simd) -> Self { unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) } } @@ -139,6 +140,7 @@ where /// # Panics /// Panics if any lane is not 0 or -1. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn from_int(value: Simd) -> Self { assert!(T::valid(value), "all values must be either 0 or -1",); unsafe { Self::from_int_unchecked(value) } @@ -147,6 +149,7 @@ where /// Converts the mask to a vector of integers, where 0 represents `false` and -1 /// represents `true`. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_int(self) -> Simd { self.0.to_int() } @@ -156,6 +159,7 @@ where /// # Safety /// `lane` must be less than `LANES`. #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub unsafe fn test_unchecked(&self, lane: usize) -> bool { unsafe { self.0.test_unchecked(lane) } } @@ -165,6 +169,7 @@ where /// # Panics /// Panics if `lane` is greater than or equal to the number of lanes in the vector. #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub fn test(&self, lane: usize) -> bool { assert!(lane < LANES, "lane index out of range"); unsafe { self.test_unchecked(lane) } @@ -195,24 +200,30 @@ where /// Convert this mask to a bitmask, with one bit set per lane. 
#[cfg(feature = "generic_const_exprs")] + #[inline] + #[must_use = "method returns a new array and does not mutate the original value"] pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { self.0.to_bitmask() } /// Convert a bitmask to a mask. #[cfg(feature = "generic_const_exprs")] + #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn from_bitmask(bitmask: [u8; LaneCount::::BITMASK_LEN]) -> Self { Self(mask_impl::Mask::from_bitmask(bitmask)) } /// Returns true if any lane is set, or false otherwise. #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub fn any(self) -> bool { self.0.any() } /// Returns true if all lanes are set, or false otherwise. #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub fn all(self) -> bool { self.0.all() } @@ -245,6 +256,7 @@ where LaneCount: SupportedLaneCount, { #[inline] + #[must_use = "method returns a defaulted mask with all lanes set to false (0)"] fn default() -> Self { Self::splat(false) } @@ -256,6 +268,7 @@ where LaneCount: SupportedLaneCount, { #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } @@ -267,6 +280,7 @@ where LaneCount: SupportedLaneCount, { #[inline] + #[must_use = "method returns a new Ordering and does not mutate the original value"] fn partial_cmp(&self, other: &Self) -> Option { self.0.partial_cmp(&other.0) } @@ -291,6 +305,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitand(self, rhs: Self) -> Self { Self(self.0 & rhs.0) } @@ -303,6 +318,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitand(self, rhs: bool) -> Self { self & Self::splat(rhs) } @@ -315,6 +331,7 @@ where { type Output = Mask; #[inline] + 
#[must_use = "method returns a new mask and does not mutate the original value"] fn bitand(self, rhs: Mask) -> Mask { Mask::splat(self) & rhs } @@ -327,6 +344,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitor(self, rhs: Self) -> Self { Self(self.0 | rhs.0) } @@ -339,6 +357,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitor(self, rhs: bool) -> Self { self | Self::splat(rhs) } @@ -351,6 +370,7 @@ where { type Output = Mask; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitor(self, rhs: Mask) -> Mask { Mask::splat(self) | rhs } @@ -363,6 +383,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitxor(self, rhs: Self) -> Self::Output { Self(self.0 ^ rhs.0) } @@ -375,6 +396,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitxor(self, rhs: bool) -> Self::Output { self ^ Self::splat(rhs) } @@ -387,6 +409,7 @@ where { type Output = Mask; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitxor(self, rhs: Mask) -> Self::Output { Mask::splat(self) ^ rhs } @@ -399,6 +422,7 @@ where { type Output = Mask; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn not(self) -> Self::Output { Self(!self.0) } diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs index 2689e1a88a8..4c964cb52e1 100644 --- a/crates/core_simd/src/masks/bitmask.rs +++ b/crates/core_simd/src/masks/bitmask.rs @@ -1,3 +1,4 @@ +#![allow(unused_imports)] use super::MaskElement; use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SupportedLaneCount}; @@ -73,6 +74,7 @@ where LaneCount: SupportedLaneCount, { 
#[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn splat(value: bool) -> Self { let mut mask = as SupportedLaneCount>::BitMask::default(); if value { @@ -87,6 +89,7 @@ where } #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub unsafe fn test_unchecked(&self, lane: usize) -> bool { (self.0.as_ref()[lane / 8] >> (lane % 8)) & 0x1 > 0 } @@ -99,30 +102,26 @@ where } #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_int(self) -> Simd { unsafe { - let mask: as SupportedLaneCount>::IntBitMask = - core::mem::transmute_copy(&self); - intrinsics::simd_select_bitmask(mask, Simd::splat(T::TRUE), Simd::splat(T::FALSE)) + crate::intrinsics::simd_select_bitmask( + self.0, + Simd::splat(T::TRUE), + Simd::splat(T::FALSE), + ) } } #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub unsafe fn from_int_unchecked(value: Simd) -> Self { - // TODO remove the transmute when rustc is more flexible - assert_eq!( - core::mem::size_of::< as SupportedLaneCount>::BitMask>(), - core::mem::size_of::< as SupportedLaneCount>::IntBitMask>(), - ); - unsafe { - let mask: as SupportedLaneCount>::IntBitMask = - intrinsics::simd_bitmask(value); - Self(core::mem::transmute_copy(&mask), PhantomData) - } + unsafe { Self(crate::intrinsics::simd_bitmask(value), PhantomData) } } #[cfg(feature = "generic_const_exprs")] #[inline] + #[must_use = "method returns a new array and does not mutate the original value"] pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { // Safety: these are the same type and we are laundering the generic unsafe { core::mem::transmute_copy(&self.0) } @@ -130,12 +129,14 @@ where #[cfg(feature = "generic_const_exprs")] #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn from_bitmask(bitmask: [u8; LaneCount::::BITMASK_LEN]) -> Self { // 
Safety: these are the same type and we are laundering the generic Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData) } #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn convert(self) -> Mask where U: MaskElement, @@ -144,11 +145,13 @@ where } #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub fn any(self) -> bool { self != Self::splat(false) } #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub fn all(self) -> bool { self == Self::splat(true) } @@ -162,6 +165,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitand(mut self, rhs: Self) -> Self { for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) { *l &= r; @@ -178,6 +182,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitor(mut self, rhs: Self) -> Self { for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) { *l |= r; @@ -193,6 +198,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitxor(mut self, rhs: Self) -> Self::Output { for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) { *l ^= r; @@ -208,6 +214,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn not(mut self) -> Self::Output { for x in self.0.as_mut() { *x = !*x; diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs index dd981cedb93..5421ccbe3d8 100644 --- a/crates/core_simd/src/masks/full_masks.rs +++ b/crates/core_simd/src/masks/full_masks.rs @@ -23,6 +23,7 @@ where LaneCount: SupportedLaneCount, { #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn 
clone(&self) -> Self { *self } @@ -70,11 +71,14 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn splat(value: bool) -> Self { Self(Simd::splat(if value { T::TRUE } else { T::FALSE })) } #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub unsafe fn test_unchecked(&self, lane: usize) -> bool { T::eq(self.0[lane], T::TRUE) } @@ -85,16 +89,19 @@ where } #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_int(self) -> Simd { self.0 } #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub unsafe fn from_int_unchecked(value: Simd) -> Self { Self(value) } #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn convert(self) -> Mask where U: MaskElement, @@ -104,17 +111,11 @@ where #[cfg(feature = "generic_const_exprs")] #[inline] + #[must_use = "method returns a new array and does not mutate the original value"] pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { unsafe { - // TODO remove the transmute when rustc can use arrays of u8 as bitmasks - assert_eq!( - core::mem::size_of::< as SupportedLaneCount>::IntBitMask>(), - LaneCount::::BITMASK_LEN, - ); - let bitmask: as SupportedLaneCount>::IntBitMask = - intrinsics::simd_bitmask(self.0); let mut bitmask: [u8; LaneCount::::BITMASK_LEN] = - core::mem::transmute_copy(&bitmask); + crate::intrinsics::simd_bitmask(self.0); // There is a bug where LLVM appears to implement this operation with the wrong // bit order. 
@@ -131,6 +132,7 @@ where #[cfg(feature = "generic_const_exprs")] #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn from_bitmask(mut bitmask: [u8; LaneCount::::BITMASK_LEN]) -> Self { unsafe { // There is a bug where LLVM appears to implement this operation with the wrong @@ -142,15 +144,7 @@ where } } - // TODO remove the transmute when rustc can use arrays of u8 as bitmasks - assert_eq!( - core::mem::size_of::< as SupportedLaneCount>::IntBitMask>(), - LaneCount::::BITMASK_LEN, - ); - let bitmask: as SupportedLaneCount>::IntBitMask = - core::mem::transmute_copy(&bitmask); - - Self::from_int_unchecked(intrinsics::simd_select_bitmask( + Self::from_int_unchecked(crate::intrinsics::simd_select_bitmask( bitmask, Self::splat(true).to_int(), Self::splat(false).to_int(), @@ -159,11 +153,13 @@ where } #[inline] + #[must_use = "method returns a new bool and does not mutate the original value"] pub fn any(self) -> bool { unsafe { intrinsics::simd_reduce_any(self.to_int()) } } #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn all(self) -> bool { unsafe { intrinsics::simd_reduce_all(self.to_int()) } } @@ -186,6 +182,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitand(self, rhs: Self) -> Self { unsafe { Self(intrinsics::simd_and(self.0, rhs.0)) } } @@ -198,6 +195,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitor(self, rhs: Self) -> Self { unsafe { Self(intrinsics::simd_or(self.0, rhs.0)) } } @@ -210,6 +208,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] fn bitxor(self, rhs: Self) -> Self { unsafe { Self(intrinsics::simd_xor(self.0, rhs.0)) } } @@ -222,6 +221,7 @@ where { type Output = Self; #[inline] + #[must_use = "method returns a 
new mask and does not mutate the original value"] fn not(self) -> Self::Output { Self::splat(true) ^ self } diff --git a/crates/core_simd/src/math.rs b/crates/core_simd/src/math.rs index 2bae414ebfb..7435b6df918 100644 --- a/crates/core_simd/src/math.rs +++ b/crates/core_simd/src/math.rs @@ -17,7 +17,7 @@ macro_rules! impl_uint_arith { /// let max = Simd::splat(MAX); /// let unsat = x + max; /// let sat = x.saturating_add(max); - /// assert_eq!(x - 1, unsat); + /// assert_eq!(unsat, Simd::from_array([1, 0, MAX, MAX - 1])); /// assert_eq!(sat, max); /// ``` #[inline] @@ -37,7 +37,7 @@ macro_rules! impl_uint_arith { /// let max = Simd::splat(MAX); /// let unsat = x - max; /// let sat = x.saturating_sub(max); - /// assert_eq!(unsat, x + 1); + /// assert_eq!(unsat, Simd::from_array([3, 2, 1, 0])); /// assert_eq!(sat, Simd::splat(0)); #[inline] pub fn saturating_sub(self, second: Self) -> Self { @@ -105,7 +105,7 @@ macro_rules! impl_int_arith { #[inline] pub fn abs(self) -> Self { const SHR: $ty = <$ty>::BITS as $ty - 1; - let m = self >> SHR; + let m = self >> Simd::splat(SHR); (self^m) - m } @@ -128,7 +128,7 @@ macro_rules! impl_int_arith { pub fn saturating_abs(self) -> Self { // arith shift for -1 or 0 mask based on sign bit, giving 2s complement const SHR: $ty = <$ty>::BITS as $ty - 1; - let m = self >> SHR; + let m = self >> Simd::splat(SHR); (self^m).saturating_sub(m) } diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs index 5d7af474caf..3582c57870b 100644 --- a/crates/core_simd/src/ops.rs +++ b/crates/core_simd/src/ops.rs @@ -1,5 +1,13 @@ use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; +use core::ops::{Add, Mul}; +use core::ops::{BitAnd, BitOr, BitXor}; +use core::ops::{Div, Rem, Sub}; +use core::ops::{Shl, Shr}; + +mod assign; +mod deref; +mod unary; impl core::ops::Index for Simd where @@ -57,166 +65,44 @@ macro_rules! 
impl_ref_ops { $(#[$attrs])* fn $fn($self_tok, $rhs_arg: $rhs_arg_ty) -> Self::Output $body } - - impl core::ops::$trait<&'_ $rhs> for $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - type Output = <$type as core::ops::$trait<$rhs>>::Output; - - $(#[$attrs])* - fn $fn($self_tok, $rhs_arg: &$rhs) -> Self::Output { - core::ops::$trait::$fn($self_tok, *$rhs_arg) - } - } - - impl core::ops::$trait<$rhs> for &'_ $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - type Output = <$type as core::ops::$trait<$rhs>>::Output; - - $(#[$attrs])* - fn $fn($self_tok, $rhs_arg: $rhs) -> Self::Output { - core::ops::$trait::$fn(*$self_tok, $rhs_arg) - } - } - - impl core::ops::$trait<&'_ $rhs> for &'_ $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - type Output = <$type as core::ops::$trait<$rhs>>::Output; - - $(#[$attrs])* - fn $fn($self_tok, $rhs_arg: &$rhs) -> Self::Output { - core::ops::$trait::$fn(*$self_tok, *$rhs_arg) - } - } }; - - // binary assignment op - { - impl core::ops::$trait:ident<$rhs:ty> for $type:ty - where - LaneCount<$lanes2:ident>: SupportedLaneCount, - { - $(#[$attrs:meta])* - fn $fn:ident(&mut $self_tok:ident, $rhs_arg:ident: $rhs_arg_ty:ty) $body:tt - } - } => { - impl core::ops::$trait<$rhs> for $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - $(#[$attrs])* - fn $fn(&mut $self_tok, $rhs_arg: $rhs_arg_ty) $body - } - - impl core::ops::$trait<&'_ $rhs> for $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - $(#[$attrs])* - fn $fn(&mut $self_tok, $rhs_arg: &$rhs_arg_ty) { - core::ops::$trait::$fn($self_tok, *$rhs_arg) - } - } - }; - - // unary op - { - impl core::ops::$trait:ident for $type:ty - where - LaneCount<$lanes2:ident>: SupportedLaneCount, - { - type Output = $output:ty; - fn $fn:ident($self_tok:ident) -> Self::Output $body:tt - } - } => { - impl core::ops::$trait for $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - type Output = $output; - fn $fn($self_tok) -> Self::Output $body 
- } - - impl core::ops::$trait for &'_ $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - type Output = <$type as core::ops::$trait>::Output; - fn $fn($self_tok) -> Self::Output { - core::ops::$trait::$fn(*$self_tok) - } - } - } } /// Automatically implements operators over vectors and scalars for a particular vector. macro_rules! impl_op { { impl Add for $scalar:ty } => { - impl_op! { @binary $scalar, Add::add, AddAssign::add_assign, simd_add } + impl_op! { @binary $scalar, Add::add, simd_add } }; { impl Sub for $scalar:ty } => { - impl_op! { @binary $scalar, Sub::sub, SubAssign::sub_assign, simd_sub } + impl_op! { @binary $scalar, Sub::sub, simd_sub } }; { impl Mul for $scalar:ty } => { - impl_op! { @binary $scalar, Mul::mul, MulAssign::mul_assign, simd_mul } + impl_op! { @binary $scalar, Mul::mul, simd_mul } }; { impl Div for $scalar:ty } => { - impl_op! { @binary $scalar, Div::div, DivAssign::div_assign, simd_div } + impl_op! { @binary $scalar, Div::div, simd_div } }; { impl Rem for $scalar:ty } => { - impl_op! { @binary $scalar, Rem::rem, RemAssign::rem_assign, simd_rem } + impl_op! { @binary $scalar, Rem::rem, simd_rem } }; { impl Shl for $scalar:ty } => { - impl_op! { @binary $scalar, Shl::shl, ShlAssign::shl_assign, simd_shl } + impl_op! { @binary $scalar, Shl::shl, simd_shl } }; { impl Shr for $scalar:ty } => { - impl_op! { @binary $scalar, Shr::shr, ShrAssign::shr_assign, simd_shr } + impl_op! { @binary $scalar, Shr::shr, simd_shr } }; { impl BitAnd for $scalar:ty } => { - impl_op! { @binary $scalar, BitAnd::bitand, BitAndAssign::bitand_assign, simd_and } + impl_op! { @binary $scalar, BitAnd::bitand, simd_and } }; { impl BitOr for $scalar:ty } => { - impl_op! { @binary $scalar, BitOr::bitor, BitOrAssign::bitor_assign, simd_or } + impl_op! { @binary $scalar, BitOr::bitor, simd_or } }; { impl BitXor for $scalar:ty } => { - impl_op! 
{ @binary $scalar, BitXor::bitxor, BitXorAssign::bitxor_assign, simd_xor } - }; - - { impl Not for $scalar:ty } => { - impl_ref_ops! { - impl core::ops::Not for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - fn not(self) -> Self::Output { - self ^ Self::splat(!<$scalar>::default()) - } - } - } - }; - - { impl Neg for $scalar:ty } => { - impl_ref_ops! { - impl core::ops::Neg for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - fn neg(self) -> Self::Output { - unsafe { intrinsics::simd_neg(self) } - } - } - } + impl_op! { @binary $scalar, BitXor::bitxor, simd_xor } }; // generic binary op with assignment when output is `Self` - { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $assign_trait:ident :: $assign_trait_fn:ident, $intrinsic:ident } => { + { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $intrinsic:ident } => { impl_ref_ops! { impl core::ops::$trait for Simd<$scalar, LANES> where @@ -232,60 +118,6 @@ macro_rules! impl_op { } } } - - impl_ref_ops! { - impl core::ops::$trait<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn $trait_fn(self, rhs: $scalar) -> Self::Output { - core::ops::$trait::$trait_fn(self, Self::splat(rhs)) - } - } - } - - impl_ref_ops! { - impl core::ops::$trait> for $scalar - where - LaneCount: SupportedLaneCount, - { - type Output = Simd<$scalar, LANES>; - - #[inline] - fn $trait_fn(self, rhs: Simd<$scalar, LANES>) -> Self::Output { - core::ops::$trait::$trait_fn(Simd::splat(self), rhs) - } - } - } - - impl_ref_ops! { - impl core::ops::$assign_trait for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn $assign_trait_fn(&mut self, rhs: Self) { - unsafe { - *self = intrinsics::$intrinsic(*self, rhs); - } - } - } - } - - impl_ref_ops! 
{ - impl core::ops::$assign_trait<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn $assign_trait_fn(&mut self, rhs: $scalar) { - core::ops::$assign_trait::$assign_trait_fn(self, Self::splat(rhs)); - } - } - } }; } @@ -298,7 +130,6 @@ macro_rules! impl_float_ops { impl_op! { impl Mul for $scalar } impl_op! { impl Div for $scalar } impl_op! { impl Rem for $scalar } - impl_op! { impl Neg for $scalar } )* }; } @@ -313,7 +144,6 @@ macro_rules! impl_unsigned_int_ops { impl_op! { impl BitAnd for $scalar } impl_op! { impl BitOr for $scalar } impl_op! { impl BitXor for $scalar } - impl_op! { impl Not for $scalar } // Integers panic on divide by 0 impl_ref_ops! { @@ -344,67 +174,6 @@ macro_rules! impl_unsigned_int_ops { } } - impl_ref_ops! { - impl core::ops::Div<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn div(self, rhs: $scalar) -> Self::Output { - if rhs == 0 { - panic!("attempt to divide by zero"); - } - if <$scalar>::MIN != 0 && - self.as_array().iter().any(|x| *x == <$scalar>::MIN) && - rhs == -1 as _ { - panic!("attempt to divide with overflow"); - } - let rhs = Self::splat(rhs); - unsafe { intrinsics::simd_div(self, rhs) } - } - } - } - - impl_ref_ops! { - impl core::ops::Div> for $scalar - where - LaneCount: SupportedLaneCount, - { - type Output = Simd<$scalar, LANES>; - - #[inline] - fn div(self, rhs: Simd<$scalar, LANES>) -> Self::Output { - Simd::splat(self) / rhs - } - } - } - - impl_ref_ops! { - impl core::ops::DivAssign for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn div_assign(&mut self, rhs: Self) { - *self = *self / rhs; - } - } - } - - impl_ref_ops! { - impl core::ops::DivAssign<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn div_assign(&mut self, rhs: $scalar) { - *self = *self / rhs; - } - } - } - // remainder panics on zero divisor impl_ref_ops! 
{ impl core::ops::Rem for Simd<$scalar, LANES> @@ -434,67 +203,6 @@ macro_rules! impl_unsigned_int_ops { } } - impl_ref_ops! { - impl core::ops::Rem<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn rem(self, rhs: $scalar) -> Self::Output { - if rhs == 0 { - panic!("attempt to calculate the remainder with a divisor of zero"); - } - if <$scalar>::MIN != 0 && - self.as_array().iter().any(|x| *x == <$scalar>::MIN) && - rhs == -1 as _ { - panic!("attempt to calculate the remainder with overflow"); - } - let rhs = Self::splat(rhs); - unsafe { intrinsics::simd_rem(self, rhs) } - } - } - } - - impl_ref_ops! { - impl core::ops::Rem> for $scalar - where - LaneCount: SupportedLaneCount, - { - type Output = Simd<$scalar, LANES>; - - #[inline] - fn rem(self, rhs: Simd<$scalar, LANES>) -> Self::Output { - Simd::splat(self) % rhs - } - } - } - - impl_ref_ops! { - impl core::ops::RemAssign for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn rem_assign(&mut self, rhs: Self) { - *self = *self % rhs; - } - } - } - - impl_ref_ops! { - impl core::ops::RemAssign<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn rem_assign(&mut self, rhs: $scalar) { - *self = *self % rhs; - } - } - } - // shifts panic on overflow impl_ref_ops! { impl core::ops::Shl for Simd<$scalar, LANES> @@ -518,49 +226,6 @@ macro_rules! impl_unsigned_int_ops { } } - impl_ref_ops! { - impl core::ops::Shl<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn shl(self, rhs: $scalar) -> Self::Output { - if invalid_shift_rhs(rhs) { - panic!("attempt to shift left with overflow"); - } - let rhs = Self::splat(rhs); - unsafe { intrinsics::simd_shl(self, rhs) } - } - } - } - - - impl_ref_ops! 
{ - impl core::ops::ShlAssign for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn shl_assign(&mut self, rhs: Self) { - *self = *self << rhs; - } - } - } - - impl_ref_ops! { - impl core::ops::ShlAssign<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn shl_assign(&mut self, rhs: $scalar) { - *self = *self << rhs; - } - } - } - impl_ref_ops! { impl core::ops::Shr for Simd<$scalar, LANES> where @@ -582,49 +247,6 @@ macro_rules! impl_unsigned_int_ops { } } } - - impl_ref_ops! { - impl core::ops::Shr<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn shr(self, rhs: $scalar) -> Self::Output { - if invalid_shift_rhs(rhs) { - panic!("attempt to shift with overflow"); - } - let rhs = Self::splat(rhs); - unsafe { intrinsics::simd_shr(self, rhs) } - } - } - } - - - impl_ref_ops! { - impl core::ops::ShrAssign for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn shr_assign(&mut self, rhs: Self) { - *self = *self >> rhs; - } - } - } - - impl_ref_ops! { - impl core::ops::ShrAssign<$scalar> for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - #[inline] - fn shr_assign(&mut self, rhs: $scalar) { - *self = *self >> rhs; - } - } - } )* }; } @@ -633,9 +255,6 @@ macro_rules! impl_unsigned_int_ops { macro_rules! impl_signed_int_ops { { $($scalar:ty),* } => { impl_unsigned_int_ops! { $($scalar),* } - $( // scalar - impl_op! { impl Neg for $scalar } - )* }; } diff --git a/crates/core_simd/src/ops/assign.rs b/crates/core_simd/src/ops/assign.rs new file mode 100644 index 00000000000..d2b48614fc9 --- /dev/null +++ b/crates/core_simd/src/ops/assign.rs @@ -0,0 +1,124 @@ +//! 
Assignment operators +use super::*; +use core::ops::{AddAssign, MulAssign}; // commutative binary op-assignment +use core::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; // commutative bit binary op-assignment +use core::ops::{DivAssign, RemAssign, SubAssign}; // non-commutative binary op-assignment +use core::ops::{ShlAssign, ShrAssign}; // non-commutative bit binary op-assignment + +// Arithmetic + +macro_rules! assign_ops { + ($(impl $assignTrait:ident for Simd + where + Self: $trait:ident, + { + fn $assign_call:ident(rhs: U) { + $call:ident + } + })*) => { + $(impl $assignTrait for Simd + where + Self: $trait, + T: SimdElement, + LaneCount: SupportedLaneCount, + { + #[inline] + fn $assign_call(&mut self, rhs: U) { + *self = self.$call(rhs); + } + })* + } +} + +assign_ops! { + // Arithmetic + impl AddAssign for Simd + where + Self: Add, + { + fn add_assign(rhs: U) { + add + } + } + + impl MulAssign for Simd + where + Self: Mul, + { + fn mul_assign(rhs: U) { + mul + } + } + + impl SubAssign for Simd + where + Self: Sub, + { + fn sub_assign(rhs: U) { + sub + } + } + + impl DivAssign for Simd + where + Self: Div, + { + fn div_assign(rhs: U) { + div + } + } + impl RemAssign for Simd + where + Self: Rem, + { + fn rem_assign(rhs: U) { + rem + } + } + + // Bitops + impl BitAndAssign for Simd + where + Self: BitAnd, + { + fn bitand_assign(rhs: U) { + bitand + } + } + + impl BitOrAssign for Simd + where + Self: BitOr, + { + fn bitor_assign(rhs: U) { + bitor + } + } + + impl BitXorAssign for Simd + where + Self: BitXor, + { + fn bitxor_assign(rhs: U) { + bitxor + } + } + + impl ShlAssign for Simd + where + Self: Shl, + { + fn shl_assign(rhs: U) { + shl + } + } + + impl ShrAssign for Simd + where + Self: Shr, + { + fn shr_assign(rhs: U) { + shr + } + } +} diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs new file mode 100644 index 00000000000..9883a74c92d --- /dev/null +++ b/crates/core_simd/src/ops/deref.rs @@ -0,0 +1,124 @@ +//! 
This module hacks in "implicit deref" for Simd's operators. +//! Ideally, Rust would take care of this itself, +//! and method calls usually handle the LHS implicitly. +//! But this is not the case with arithmetic ops. +use super::*; + +macro_rules! deref_lhs { + (impl $trait:ident for $simd:ty { + fn $call:ident + }) => { + impl $trait<$simd> for &$simd + where + T: SimdElement, + $simd: $trait<$simd, Output = $simd>, + LaneCount: SupportedLaneCount, + { + type Output = Simd; + + #[inline] + #[must_use = "operator returns a new vector without mutating the inputs"] + fn $call(self, rhs: $simd) -> Self::Output { + (*self).$call(rhs) + } + } + }; +} + +macro_rules! deref_rhs { + (impl $trait:ident for $simd:ty { + fn $call:ident + }) => { + impl $trait<&$simd> for $simd + where + T: SimdElement, + $simd: $trait<$simd, Output = $simd>, + LaneCount: SupportedLaneCount, + { + type Output = Simd; + + #[inline] + #[must_use = "operator returns a new vector without mutating the inputs"] + fn $call(self, rhs: &$simd) -> Self::Output { + self.$call(*rhs) + } + } + }; +} + +macro_rules! deref_ops { + ($(impl $trait:ident for $simd:ty { + fn $call:ident + })*) => { + $( + deref_rhs! { + impl $trait for $simd { + fn $call + } + } + deref_lhs! { + impl $trait for $simd { + fn $call + } + } + impl<'lhs, 'rhs, T, const LANES: usize> $trait<&'rhs $simd> for &'lhs $simd + where + T: SimdElement, + $simd: $trait<$simd, Output = $simd>, + LaneCount: SupportedLaneCount, + { + type Output = $simd; + + #[inline] + #[must_use = "operator returns a new vector without mutating the inputs"] + fn $call(self, rhs: &$simd) -> Self::Output { + (*self).$call(*rhs) + } + } + )* + } +} + +deref_ops! 
{ + // Arithmetic + impl Add for Simd { + fn add + } + + impl Mul for Simd { + fn mul + } + + impl Sub for Simd { + fn sub + } + + impl Div for Simd { + fn div + } + + impl Rem for Simd { + fn rem + } + + // Bitops + impl BitAnd for Simd { + fn bitand + } + + impl BitOr for Simd { + fn bitor + } + + impl BitXor for Simd { + fn bitxor + } + + impl Shl for Simd { + fn shl + } + + impl Shr for Simd { + fn shr + } +} diff --git a/crates/core_simd/src/ops/unary.rs b/crates/core_simd/src/ops/unary.rs new file mode 100644 index 00000000000..4ebea560fc6 --- /dev/null +++ b/crates/core_simd/src/ops/unary.rs @@ -0,0 +1,77 @@ +use crate::simd::intrinsics; +use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; +use core::ops::{Neg, Not}; // unary ops + +macro_rules! neg { + ($(impl Neg for Simd<$scalar:ty, LANES>)*) => { + $(impl Neg for Simd<$scalar, LANES> + where + $scalar: SimdElement, + LaneCount: SupportedLaneCount, + { + type Output = Self; + + #[inline] + #[must_use = "operator returns a new vector without mutating the input"] + fn neg(self) -> Self::Output { + unsafe { intrinsics::simd_neg(self) } + } + })* + } +} + +neg! { + impl Neg for Simd + + impl Neg for Simd + + impl Neg for Simd + + impl Neg for Simd + + impl Neg for Simd + + impl Neg for Simd + + impl Neg for Simd +} + +macro_rules! not { + ($(impl Not for Simd<$scalar:ty, LANES>)*) => { + $(impl Not for Simd<$scalar, LANES> + where + $scalar: SimdElement, + LaneCount: SupportedLaneCount, + { + type Output = Self; + + #[inline] + #[must_use = "operator returns a new vector without mutating the input"] + fn not(self) -> Self::Output { + self ^ (Simd::splat(!(0 as $scalar))) + } + })* + } +} + +not! 
{ + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd + + impl Not for Simd +} diff --git a/crates/core_simd/src/reduction.rs b/crates/core_simd/src/reduction.rs index db0640aae79..e79a185816b 100644 --- a/crates/core_simd/src/reduction.rs +++ b/crates/core_simd/src/reduction.rs @@ -2,7 +2,8 @@ use crate::simd::intrinsics::{ simd_reduce_add_ordered, simd_reduce_and, simd_reduce_max, simd_reduce_min, simd_reduce_mul_ordered, simd_reduce_or, simd_reduce_xor, }; -use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; +use core::ops::{BitAnd, BitOr, BitXor}; macro_rules! impl_integer_reductions { { $scalar:ty } => { @@ -22,27 +23,6 @@ macro_rules! impl_integer_reductions { unsafe { simd_reduce_mul_ordered(self, 1) } } - /// Horizontal bitwise "and". Returns the cumulative bitwise "and" across the lanes of - /// the vector. - #[inline] - pub fn horizontal_and(self) -> $scalar { - unsafe { simd_reduce_and(self) } - } - - /// Horizontal bitwise "or". Returns the cumulative bitwise "or" across the lanes of - /// the vector. - #[inline] - pub fn horizontal_or(self) -> $scalar { - unsafe { simd_reduce_or(self) } - } - - /// Horizontal bitwise "xor". Returns the cumulative bitwise "xor" across the lanes of - /// the vector. - #[inline] - pub fn horizontal_xor(self) -> $scalar { - unsafe { simd_reduce_xor(self) } - } - /// Horizontal maximum. Returns the maximum lane in the vector. #[inline] pub fn horizontal_max(self) -> $scalar { @@ -121,3 +101,45 @@ macro_rules! impl_float_reductions { impl_float_reductions! { f32 } impl_float_reductions! { f64 } + +impl Simd +where + Self: BitAnd, + T: SimdElement + BitAnd, + LaneCount: SupportedLaneCount, +{ + /// Horizontal bitwise "and". Returns the cumulative bitwise "and" across the lanes of + /// the vector. 
+ #[inline] + pub fn horizontal_and(self) -> T { + unsafe { simd_reduce_and(self) } + } +} + +impl Simd +where + Self: BitOr, + T: SimdElement + BitOr, + LaneCount: SupportedLaneCount, +{ + /// Horizontal bitwise "or". Returns the cumulative bitwise "or" across the lanes of + /// the vector. + #[inline] + pub fn horizontal_or(self) -> T { + unsafe { simd_reduce_or(self) } + } +} + +impl Simd +where + Self: BitXor, + T: SimdElement + BitXor, + LaneCount: SupportedLaneCount, +{ + /// Horizontal bitwise "xor". Returns the cumulative bitwise "xor" across the lanes of + /// the vector. + #[inline] + pub fn horizontal_xor(self) -> T { + unsafe { simd_reduce_xor(self) } + } +} diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs index d976231a03a..5d696ebf76e 100644 --- a/crates/core_simd/src/select.rs +++ b/crates/core_simd/src/select.rs @@ -17,6 +17,7 @@ where LaneCount: SupportedLaneCount, { #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] fn select(mask: Mask, true_values: Self, false_values: Self) -> Self { unsafe { intrinsics::simd_select(mask.to_int(), true_values, false_values) } } @@ -35,6 +36,7 @@ where LaneCount: SupportedLaneCount, { #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] fn select(mask: Self, true_values: Self, false_values: Self) -> Self { mask & true_values | !mask & false_values } @@ -80,6 +82,7 @@ where /// assert_eq!(c.to_array(), [true, false, true, false]); /// ``` #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn select>(self, true_values: S, false_values: S) -> S { S::select(self, true_values, false_values) } diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs index 62cda68f0a9..bdc489774a5 100644 --- a/crates/core_simd/src/swizzle.rs +++ b/crates/core_simd/src/swizzle.rs @@ -87,6 +87,8 @@ pub trait Swizzle { /// Create a new vector from the 
lanes of `vector`. /// /// Lane `i` of the output is `vector[Self::INDEX[i]]`. + #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] fn swizzle(vector: Simd) -> Simd where T: SimdElement, @@ -106,6 +108,8 @@ pub trait Swizzle2 { /// /// Lane `i` is `first[j]` when `Self::INDEX[i]` is `First(j)`, or `second[j]` when it is /// `Second(j)`. + #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] fn swizzle2( first: Simd, second: Simd, @@ -182,6 +186,7 @@ where { /// Reverse the order of the lanes in the vector. #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn reverse(self) -> Self { const fn reverse_index() -> [usize; LANES] { let mut index = [0; LANES]; @@ -206,6 +211,7 @@ where /// while the last `LANES - OFFSET` elements move to the front. After calling `rotate_lanes_left`, /// the element previously in lane `OFFSET` will become the first element in the slice. #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn rotate_lanes_left(self) -> Self { const fn rotate_index() -> [usize; LANES] { let offset = OFFSET % LANES; @@ -231,6 +237,7 @@ where /// the end while the last `OFFSET` elements move to the front. After calling `rotate_lanes_right`, /// the element previously at index `LANES - OFFSET` will become the first element in the slice. 
#[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn rotate_lanes_right(self) -> Self { const fn rotate_index() -> [usize; LANES] { let offset = LANES - OFFSET % LANES; @@ -273,6 +280,7 @@ where /// assert_eq!(y.to_array(), [2, 6, 3, 7]); /// ``` #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn interleave(self, other: Self) -> (Self, Self) { const fn lo() -> [Which; LANES] { let mut idx = [Which::First(0); LANES]; @@ -336,6 +344,7 @@ where /// assert_eq!(y.to_array(), [4, 5, 6, 7]); /// ``` #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn deinterleave(self, other: Self) -> (Self, Self) { const fn even() -> [Which; LANES] { let mut idx = [Which::First(0); LANES]; diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs index c09d0ac84d2..4a4b23238c4 100644 --- a/crates/core_simd/src/vector/float.rs +++ b/crates/core_simd/src/vector/float.rs @@ -15,6 +15,7 @@ macro_rules! impl_float_vector { /// Raw transmutation to an unsigned integer vector type with the /// same size and number of lanes. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_bits(self) -> Simd<$bits_ty, LANES> { assert_eq!(core::mem::size_of::(), core::mem::size_of::>()); unsafe { core::mem::transmute_copy(&self) } @@ -23,6 +24,7 @@ macro_rules! impl_float_vector { /// Raw transmutation from an unsigned integer vector type with the /// same size and number of lanes. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn from_bits(bits: Simd<$bits_ty, LANES>) -> Self { assert_eq!(core::mem::size_of::(), core::mem::size_of::>()); unsafe { core::mem::transmute_copy(&bits) } @@ -31,6 +33,7 @@ macro_rules! 
impl_float_vector { /// Produces a vector where every lane has the absolute value of the /// equivalently-indexed lane in `self`. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn abs(self) -> Self { unsafe { intrinsics::simd_fabs(self) } } @@ -44,6 +47,7 @@ macro_rules! impl_float_vector { /// hardware in mind. #[cfg(feature = "std")] #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn mul_add(self, a: Self, b: Self) -> Self { unsafe { intrinsics::simd_fma(self, a, b) } } @@ -51,6 +55,7 @@ macro_rules! impl_float_vector { /// Produces a vector where every lane has the square root value /// of the equivalently-indexed lane in `self` #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] #[cfg(feature = "std")] pub fn sqrt(self) -> Self { unsafe { intrinsics::simd_fsqrt(self) } @@ -58,12 +63,14 @@ macro_rules! impl_float_vector { /// Takes the reciprocal (inverse) of each lane, `1/x`. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn recip(self) -> Self { Self::splat(1.0) / self } /// Converts each lane from radians to degrees. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_degrees(self) -> Self { // to_degrees uses a special constant for better precision, so extract that constant self * Self::splat(<$type>::to_degrees(1.)) @@ -71,6 +78,7 @@ macro_rules! impl_float_vector { /// Converts each lane from degrees to radians. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_radians(self) -> Self { self * Self::splat(<$type>::to_radians(1.)) } @@ -78,6 +86,7 @@ macro_rules! impl_float_vector { /// Returns true for each lane if it has a positive sign, including /// `+0.0`, `NaN`s with positive sign bit and positive infinity. 
#[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_sign_positive(self) -> Mask<$mask_ty, LANES> { !self.is_sign_negative() } @@ -85,6 +94,7 @@ macro_rules! impl_float_vector { /// Returns true for each lane if it has a negative sign, including /// `-0.0`, `NaN`s with negative sign bit and negative infinity. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_sign_negative(self) -> Mask<$mask_ty, LANES> { let sign_bits = self.to_bits() & Simd::splat((!0 >> 1) + 1); sign_bits.lanes_gt(Simd::splat(0)) @@ -92,24 +102,28 @@ macro_rules! impl_float_vector { /// Returns true for each lane if its value is `NaN`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_nan(self) -> Mask<$mask_ty, LANES> { self.lanes_ne(self) } /// Returns true for each lane if its value is positive infinity or negative infinity. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_infinite(self) -> Mask<$mask_ty, LANES> { self.abs().lanes_eq(Self::splat(<$type>::INFINITY)) } /// Returns true for each lane if its value is neither infinite nor `NaN`. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_finite(self) -> Mask<$mask_ty, LANES> { self.abs().lanes_lt(Self::splat(<$type>::INFINITY)) } /// Returns true for each lane if its value is subnormal. #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_subnormal(self) -> Mask<$mask_ty, LANES> { self.abs().lanes_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(<$type>::INFINITY).to_bits()).lanes_eq(Simd::splat(0)) } @@ -117,6 +131,7 @@ macro_rules! impl_float_vector { /// Returns true for each lane if its value is neither neither zero, infinite, /// subnormal, or `NaN`. 
#[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_normal(self) -> Mask<$mask_ty, LANES> { !(self.abs().lanes_eq(Self::splat(0.0)) | self.is_nan() | self.is_subnormal() | self.is_infinite()) } @@ -127,6 +142,7 @@ macro_rules! impl_float_vector { /// * `-1.0` if the number is negative, `-0.0`, or `NEG_INFINITY` /// * `NAN` if the number is `NAN` #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn signum(self) -> Self { self.is_nan().select(Self::splat(<$type>::NAN), Self::splat(1.0).copysign(self)) } @@ -135,6 +151,7 @@ macro_rules! impl_float_vector { /// /// If any lane is a `NAN`, then a `NAN` with the sign of `sign` is returned. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn copysign(self, sign: Self) -> Self { let sign_bit = sign.to_bits() & Self::splat(-0.).to_bits(); let magnitude = self.to_bits() & !Self::splat(-0.).to_bits(); @@ -145,6 +162,7 @@ macro_rules! impl_float_vector { /// /// If one of the values is `NAN`, then the other value is returned. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn min(self, other: Self) -> Self { // TODO consider using an intrinsic self.is_nan().select( @@ -157,6 +175,7 @@ macro_rules! impl_float_vector { /// /// If one of the values is `NAN`, then the other value is returned. #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn max(self, other: Self) -> Self { // TODO consider using an intrinsic self.is_nan().select( @@ -171,6 +190,7 @@ macro_rules! impl_float_vector { /// greater than `max`, and the corresponding lane in `min` if the lane is less /// than `min`. Otherwise returns the lane in `self`. 
#[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] pub fn clamp(self, min: Self, max: Self) -> Self { assert!( min.lanes_le(max).all(), diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs index ac9b98ca031..c668d9a6eae 100644 --- a/crates/core_simd/src/vector/ptr.rs +++ b/crates/core_simd/src/vector/ptr.rs @@ -23,7 +23,7 @@ where pub fn wrapping_add(self, addend: Simd) -> Self { unsafe { let x: Simd = mem::transmute_copy(&self); - mem::transmute_copy(&{ x + (addend * mem::size_of::()) }) + mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) } } } @@ -49,7 +49,7 @@ where pub fn wrapping_add(self, addend: Simd) -> Self { unsafe { let x: Simd = mem::transmute_copy(&self); - mem::transmute_copy(&{ x + (addend * mem::size_of::()) }) + mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) } } } diff --git a/crates/core_simd/src/vendor/x86.rs b/crates/core_simd/src/vendor/x86.rs index d3c19ccc539..0dd47015ed2 100644 --- a/crates/core_simd/src/vendor/x86.rs +++ b/crates/core_simd/src/vendor/x86.rs @@ -8,10 +8,10 @@ use core::arch::x86_64::*; from_transmute! { unsafe u8x16 => __m128i } from_transmute! { unsafe u8x32 => __m256i } -//from_transmute! { unsafe u8x64 => __m512i } +from_transmute! { unsafe u8x64 => __m512i } from_transmute! { unsafe i8x16 => __m128i } from_transmute! { unsafe i8x32 => __m256i } -//from_transmute! { unsafe i8x64 => __m512i } +from_transmute! { unsafe i8x64 => __m512i } from_transmute! { unsafe u16x8 => __m128i } from_transmute! { unsafe u16x16 => __m256i } diff --git a/crates/core_simd/tests/autoderef.rs b/crates/core_simd/tests/autoderef.rs new file mode 100644 index 00000000000..9359da16ee5 --- /dev/null +++ b/crates/core_simd/tests/autoderef.rs @@ -0,0 +1,22 @@ +// Test that we handle all our "auto-deref" cases correctly. 
+#![feature(portable_simd)] +use core_simd::f32x4; + +#[cfg(target_arch = "wasm32")] +use wasm_bindgen_test::*; + +#[cfg(target_arch = "wasm32")] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn deref() { + let x = f32x4::splat(1.0); + let y = f32x4::splat(2.0); + let a = &x; + let b = &y; + assert_eq!(f32x4::splat(3.0), x + y); + assert_eq!(f32x4::splat(3.0), x + b); + assert_eq!(f32x4::splat(3.0), a + y); + assert_eq!(f32x4::splat(3.0), a + b); +} diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs index 31b7ee20695..43ddde4c55e 100644 --- a/crates/core_simd/tests/ops_macros.rs +++ b/crates/core_simd/tests/ops_macros.rs @@ -38,22 +38,6 @@ macro_rules! impl_binary_op_test { ); } - fn scalar_rhs() { - test_helpers::test_binary_scalar_rhs_elementwise( - & as core::ops::$trait<$scalar>>::$fn, - &$scalar_fn, - &|_, _| true, - ); - } - - fn scalar_lhs() { - test_helpers::test_binary_scalar_lhs_elementwise( - &<$scalar as core::ops::$trait>>::$fn, - &$scalar_fn, - &|_, _| true, - ); - } - fn assign() { test_helpers::test_binary_elementwise( &|mut a, b| { as core::ops::$trait_assign>::$fn_assign(&mut a, b); a }, @@ -61,14 +45,6 @@ macro_rules! impl_binary_op_test { &|_, _| true, ); } - - fn assign_scalar_rhs() { - test_helpers::test_binary_scalar_rhs_elementwise( - &|mut a, b| { as core::ops::$trait_assign<$scalar>>::$fn_assign(&mut a, b); a }, - &$scalar_fn, - &|_, _| true, - ); - } } } }; @@ -99,22 +75,6 @@ macro_rules! 
impl_binary_checked_op_test { ); } - fn scalar_rhs() { - test_helpers::test_binary_scalar_rhs_elementwise( - & as core::ops::$trait<$scalar>>::$fn, - &$scalar_fn, - &|x, y| x.iter().all(|x| $check_fn(*x, y)), - ); - } - - fn scalar_lhs() { - test_helpers::test_binary_scalar_lhs_elementwise( - &<$scalar as core::ops::$trait>>::$fn, - &$scalar_fn, - &|x, y| y.iter().all(|y| $check_fn(x, *y)), - ); - } - fn assign() { test_helpers::test_binary_elementwise( &|mut a, b| { as core::ops::$trait_assign>::$fn_assign(&mut a, b); a }, @@ -122,14 +82,6 @@ macro_rules! impl_binary_checked_op_test { &|x, y| x.iter().zip(y.iter()).all(|(x, y)| $check_fn(*x, *y)), ) } - - fn assign_scalar_rhs() { - test_helpers::test_binary_scalar_rhs_elementwise( - &|mut a, b| { as core::ops::$trait_assign<$scalar>>::$fn_assign(&mut a, b); a }, - &$scalar_fn, - &|x, y| x.iter().all(|x| $check_fn(*x, y)), - ) - } } } }; diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs index 5c6478876f3..7edd6096381 100644 --- a/crates/test_helpers/src/lib.rs +++ b/crates/test_helpers/src/lib.rs @@ -376,6 +376,12 @@ macro_rules! test_lanes { fn lanes_32() { implementation::<32>(); } + + #[test] + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + fn lanes_64() { + implementation::<64>(); + } } )* } @@ -431,6 +437,12 @@ macro_rules! 
test_lanes_panic { fn lanes_32() { implementation::<32>(); } + + #[test] + #[should_panic] + fn lanes_64() { + implementation::<64>(); + } } )* } From 35883a79158e549a3bf5bb20999dd87e448b6c2e Mon Sep 17 00:00:00 2001 From: Vadim Petrochenkov Date: Fri, 17 Dec 2021 15:10:53 +0800 Subject: [PATCH 02/35] Merge commit '533f0fc81ab9ba097779fcd27c8f9ea12261fef5' into psimd --- crates/core_simd/src/masks/bitmask.rs | 8 +-- crates/core_simd/src/masks/full_masks.rs | 4 +- crates/core_simd/src/mod.rs | 1 - crates/core_simd/src/select.rs | 74 +++++++----------------- 4 files changed, 25 insertions(+), 62 deletions(-) diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs index 4c964cb52e1..b4217dc87ba 100644 --- a/crates/core_simd/src/masks/bitmask.rs +++ b/crates/core_simd/src/masks/bitmask.rs @@ -105,18 +105,14 @@ where #[must_use = "method returns a new vector and does not mutate the original value"] pub fn to_int(self) -> Simd { unsafe { - crate::intrinsics::simd_select_bitmask( - self.0, - Simd::splat(T::TRUE), - Simd::splat(T::FALSE), - ) + intrinsics::simd_select_bitmask(self.0, Simd::splat(T::TRUE), Simd::splat(T::FALSE)) } } #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub unsafe fn from_int_unchecked(value: Simd) -> Self { - unsafe { Self(crate::intrinsics::simd_bitmask(value), PhantomData) } + unsafe { Self(intrinsics::simd_bitmask(value), PhantomData) } } #[cfg(feature = "generic_const_exprs")] diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs index 5421ccbe3d8..e5bb784bb91 100644 --- a/crates/core_simd/src/masks/full_masks.rs +++ b/crates/core_simd/src/masks/full_masks.rs @@ -115,7 +115,7 @@ where pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { unsafe { let mut bitmask: [u8; LaneCount::::BITMASK_LEN] = - crate::intrinsics::simd_bitmask(self.0); + intrinsics::simd_bitmask(self.0); // There is a bug where LLVM appears to 
implement this operation with the wrong // bit order. @@ -144,7 +144,7 @@ where } } - Self::from_int_unchecked(crate::intrinsics::simd_select_bitmask( + Self::from_int_unchecked(intrinsics::simd_select_bitmask( bitmask, Self::splat(true).to_int(), Self::splat(false).to_int(), diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs index ec874a22389..85026265956 100644 --- a/crates/core_simd/src/mod.rs +++ b/crates/core_simd/src/mod.rs @@ -27,7 +27,6 @@ pub mod simd { pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount}; pub use crate::core_simd::masks::*; - pub use crate::core_simd::select::Select; pub use crate::core_simd::swizzle::*; pub use crate::core_simd::vector::*; } diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs index 5d696ebf76e..8d521057fbd 100644 --- a/crates/core_simd/src/select.rs +++ b/crates/core_simd/src/select.rs @@ -1,54 +1,6 @@ use crate::simd::intrinsics; use crate::simd::{LaneCount, Mask, MaskElement, Simd, SimdElement, SupportedLaneCount}; -mod sealed { - pub trait Sealed { - fn select(mask: Mask, true_values: Self, false_values: Self) -> Self; - } -} -use sealed::Sealed; - -/// Supporting trait for vector `select` function -pub trait Select: Sealed {} - -impl Sealed> for Simd -where - T: SimdElement, - LaneCount: SupportedLaneCount, -{ - #[inline] - #[must_use = "method returns a new vector and does not mutate the original inputs"] - fn select(mask: Mask, true_values: Self, false_values: Self) -> Self { - unsafe { intrinsics::simd_select(mask.to_int(), true_values, false_values) } - } -} - -impl Select> for Simd -where - T: SimdElement, - LaneCount: SupportedLaneCount, -{ -} - -impl Sealed for Mask -where - T: MaskElement, - LaneCount: SupportedLaneCount, -{ - #[inline] - #[must_use = "method returns a new vector and does not mutate the original inputs"] - fn select(mask: Self, true_values: Self, false_values: Self) -> Self { - mask & true_values | !mask & false_values - } -} - 
-impl Select for Mask -where - T: MaskElement, - LaneCount: SupportedLaneCount, -{ -} - impl Mask where T: MaskElement, @@ -69,8 +21,24 @@ where /// let c = mask.select(a, b); /// assert_eq!(c.to_array(), [0, 5, 6, 3]); /// ``` + #[inline] + #[must_use = "method returns a new vector and does not mutate the original inputs"] + pub fn select( + self, + true_values: Simd, + false_values: Simd, + ) -> Simd + where + U: SimdElement, + { + unsafe { intrinsics::simd_select(self.to_int(), true_values, false_values) } + } + + /// Choose lanes from two masks. + /// + /// For each lane in the mask, choose the corresponding lane from `true_values` if + /// that lane mask is true, and `false_values` if that lane mask is false. /// - /// `select` can also be used on masks: /// ``` /// # #![feature(portable_simd)] /// # #[cfg(feature = "std")] use core_simd::Mask; @@ -78,12 +46,12 @@ where /// let a = Mask::::from_array([true, true, false, false]); /// let b = Mask::::from_array([false, false, true, true]); /// let mask = Mask::::from_array([true, false, false, true]); - /// let c = mask.select(a, b); + /// let c = mask.select_mask(a, b); /// assert_eq!(c.to_array(), [true, false, true, false]); /// ``` #[inline] - #[must_use = "method returns a new vector and does not mutate the original inputs"] - pub fn select>(self, true_values: S, false_values: S) -> S { - S::select(self, true_values, false_values) + #[must_use = "method returns a new mask and does not mutate the original inputs"] + pub fn select_mask(self, true_values: Self, false_values: Self) -> Self { + self & true_values | !self & false_values } } From efb20c2d20aeb8868909acef26814678f862bfd4 Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Thu, 27 Jan 2022 11:23:40 -0800 Subject: [PATCH 03/35] Sync rust-lang/portable-simd@03f6fbb21e6050da2a05b3ce8f480c020b384916 --- Cargo.toml | 1 + crates/core_simd/Cargo.toml | 3 + crates/core_simd/examples/nbody.rs | 10 +- crates/core_simd/src/intrinsics.rs | 34 +-- 
crates/core_simd/src/masks.rs | 42 ++- crates/core_simd/src/ops.rs | 394 +++++++++++++-------------- crates/core_simd/src/round.rs | 41 --- crates/core_simd/src/vector.rs | 30 ++ crates/core_simd/src/vector/float.rs | 39 +-- crates/core_simd/tests/cast.rs | 37 +++ crates/core_simd/tests/ops_macros.rs | 2 + crates/core_simd/tests/round.rs | 2 + crates/std_float/Cargo.toml | 13 + crates/std_float/src/lib.rs | 165 +++++++++++ 14 files changed, 486 insertions(+), 327 deletions(-) create mode 100644 crates/core_simd/tests/cast.rs create mode 100644 crates/std_float/Cargo.toml create mode 100644 crates/std_float/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 3f1abd73519..9802386e456 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,5 +2,6 @@ members = [ "crates/core_simd", + "crates/std_float", "crates/test_helpers", ] diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml index a103ef115a5..d2ff5f3b1b1 100644 --- a/crates/core_simd/Cargo.toml +++ b/crates/core_simd/Cargo.toml @@ -26,3 +26,6 @@ features = ["alloc"] [dev-dependencies.test_helpers] path = "../test_helpers" + +[dev-dependencies] +std_float = { path = "../std_float/", features = ["as_crate"] } diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs index 43280feebbd..7b1e6840f64 100644 --- a/crates/core_simd/examples/nbody.rs +++ b/crates/core_simd/examples/nbody.rs @@ -1,11 +1,13 @@ -#![cfg_attr(feature = "std", feature(portable_simd))] +#![feature(portable_simd)] +extern crate std_float; /// Benchmarks game nbody code /// Taken from the `packed_simd` crate /// Run this benchmark with `cargo test --example nbody` -#[cfg(feature = "std")] mod nbody { - use core_simd::*; + use core_simd::simd::*; + #[allow(unused)] // False positive? 
+ use std_float::StdFloat; use std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; @@ -167,7 +169,6 @@ mod nbody { } } -#[cfg(feature = "std")] #[cfg(test)] mod tests { // Good enough for demonstration purposes, not going for strictness here. @@ -184,7 +185,6 @@ mod tests { } fn main() { - #[cfg(feature = "std")] { let (energy_before, energy_after) = nbody::run(1000); println!("Energy before: {}", energy_before); diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 6a6d26d10a7..233657202f7 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -39,6 +39,10 @@ extern "platform-intrinsic" { /// fptoui/fptosi/uitofp/sitofp pub(crate) fn simd_cast(x: T) -> U; + /// follows Rust's `T as U` semantics, including saturating float casts + /// which amounts to the same as `simd_cast` for many cases + #[cfg(not(bootstrap))] + pub(crate) fn simd_as(x: T) -> U; /// neg/fneg pub(crate) fn simd_neg(x: T) -> T; @@ -46,6 +50,10 @@ extern "platform-intrinsic" { /// fabs pub(crate) fn simd_fabs(x: T) -> T; + // minnum/maxnum + pub(crate) fn simd_fmin(x: T, y: T) -> T; + pub(crate) fn simd_fmax(x: T, y: T) -> T; + pub(crate) fn simd_eq(x: T, y: T) -> U; pub(crate) fn simd_ne(x: T, y: T) -> U; pub(crate) fn simd_lt(x: T, y: T) -> U; @@ -87,29 +95,3 @@ extern "platform-intrinsic" { #[allow(unused)] pub(crate) fn simd_select_bitmask(m: M, a: T, b: T) -> T; } - -#[cfg(feature = "std")] -mod std { - extern "platform-intrinsic" { - // ceil - pub(crate) fn simd_ceil(x: T) -> T; - - // floor - pub(crate) fn simd_floor(x: T) -> T; - - // round - pub(crate) fn simd_round(x: T) -> T; - - // trunc - pub(crate) fn simd_trunc(x: T) -> T; - - // fsqrt - pub(crate) fn simd_fsqrt(x: T) -> T; - - // fma - pub(crate) fn simd_fma(x: T, y: T, z: T) -> T; - } -} - -#[cfg(feature = "std")] -pub(crate) use crate::simd::intrinsics::std::*; diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index 
191e9690313..ae1fef53da8 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -12,9 +12,10 @@ )] mod mask_impl; +use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; use core::cmp::Ordering; -use core::fmt; +use core::{fmt, mem}; mod sealed { use super::*; @@ -105,22 +106,39 @@ where Self(mask_impl::Mask::splat(value)) } - /// Converts an array to a SIMD vector. + /// Converts an array of bools to a SIMD mask. pub fn from_array(array: [bool; LANES]) -> Self { - let mut vector = Self::splat(false); - for (i, v) in array.iter().enumerate() { - vector.set(i, *v); + // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of + // true: 0b_0000_0001 + // false: 0b_0000_0000 + // Thus, an array of bools is also a valid array of bytes: [u8; N] + // This would be hypothetically valid as an "in-place" transmute, + // but these are "dependently-sized" types, so copy elision it is! + unsafe { + let bytes: [u8; LANES] = mem::transmute_copy(&array); + let bools: Simd = + intrinsics::simd_ne(Simd::from_array(bytes), Simd::splat(0u8)); + Mask::from_int_unchecked(intrinsics::simd_cast(bools)) } - vector } - /// Converts a SIMD vector to an array. + /// Converts a SIMD mask to an array of bools. pub fn to_array(self) -> [bool; LANES] { - let mut array = [false; LANES]; - for (i, v) in array.iter_mut().enumerate() { - *v = self.test(i); + // This follows mostly the same logic as from_array. + // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of + // true: 0b_0000_0001 + // false: 0b_0000_0000 + // Thus, an array of bools is also a valid array of bytes: [u8; N] + // Since our masks are equal to integers where all bits are set, + // we can simply convert them to i8s, and then bitand them by the + // bitpattern for Rust's "true" bool. + // This would be hypothetically valid as an "in-place" transmute, + // but these are "dependently-sized" types, so copy elision it is! 
+ unsafe { + let mut bytes: Simd = intrinsics::simd_cast(self.to_int()); + bytes &= Simd::splat(1i8); + mem::transmute_copy(&bytes) } - array } /// Converts a vector of integers to a mask, where 0 represents `false` and -1 @@ -516,7 +534,7 @@ pub type mask16x8 = Mask; pub type mask16x16 = Mask; /// Vector of 32 16-bit masks -pub type mask16x32 = Mask; +pub type mask16x32 = Mask; /// Vector of two 32-bit masks pub type mask32x2 = Mask; diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs index 3582c57870b..b65038933bf 100644 --- a/crates/core_simd/src/ops.rs +++ b/crates/core_simd/src/ops.rs @@ -1,4 +1,3 @@ -use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; use core::ops::{Add, Mul}; use core::ops::{BitAnd, BitOr, BitXor}; @@ -32,232 +31,211 @@ where } } -/// Checks if the right-hand side argument of a left- or right-shift would cause overflow. -fn invalid_shift_rhs(rhs: T) -> bool -where - T: Default + PartialOrd + core::convert::TryFrom, - >::Error: core::fmt::Debug, -{ - let bits_in_type = T::try_from(8 * core::mem::size_of::()).unwrap(); - rhs < T::default() || rhs >= bits_in_type +macro_rules! unsafe_base { + ($lhs:ident, $rhs:ident, {$simd_call:ident}, $($_:tt)*) => { + unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) } + }; } -/// Automatically implements operators over references in addition to the provided operator. -macro_rules! impl_ref_ops { - // binary op - { - impl core::ops::$trait:ident<$rhs:ty> for $type:ty - where - LaneCount<$lanes2:ident>: SupportedLaneCount, +/// SAFETY: This macro should not be used for anything except Shl or Shr, and passed the appropriate shift intrinsic. 
+/// It handles performing a bitand in addition to calling the shift operator, so that the result +/// is well-defined: LLVM can return a poison value if you shl, lshr, or ashr if rhs >= ::BITS +/// At worst, this will maybe add another instruction and cycle, +/// at best, it may open up more optimization opportunities, +/// or simply be elided entirely, especially for SIMD ISAs which default to this. +/// +// FIXME: Consider implementing this in cg_llvm instead? +// cg_clif defaults to this, and scalar MIR shifts also default to wrapping +macro_rules! wrap_bitshift { + ($lhs:ident, $rhs:ident, {$simd_call:ident}, $int:ident) => { + unsafe { + $crate::simd::intrinsics::$simd_call( + $lhs, + $rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)), + ) + } + }; +} + +// Division by zero is poison, according to LLVM. +// So is dividing the MIN value of a signed integer by -1, +// since that would return MAX + 1. +// FIXME: Rust allows ::MIN / -1, +// so we should probably figure out how to make that safe. +macro_rules! 
int_divrem_guard { + ( $lhs:ident, + $rhs:ident, + { const PANIC_ZERO: &'static str = $zero:literal; + const PANIC_OVERFLOW: &'static str = $overflow:literal; + $simd_call:ident + }, + $int:ident ) => { + if $rhs.lanes_eq(Simd::splat(0)).any() { + panic!($zero); + } else if <$int>::MIN != 0 + && ($lhs.lanes_eq(Simd::splat(<$int>::MIN)) + // type inference can break here, so cut an SInt to size + & $rhs.lanes_eq(Simd::splat(-1i64 as _))).any() { - type Output = $output:ty; - - $(#[$attrs:meta])* - fn $fn:ident($self_tok:ident, $rhs_arg:ident: $rhs_arg_ty:ty) -> Self::Output $body:tt - } - } => { - impl core::ops::$trait<$rhs> for $type - where - LaneCount<$lanes2>: SupportedLaneCount, - { - type Output = $output; - - $(#[$attrs])* - fn $fn($self_tok, $rhs_arg: $rhs_arg_ty) -> Self::Output $body + panic!($overflow); + } else { + unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) } } }; } -/// Automatically implements operators over vectors and scalars for a particular vector. -macro_rules! impl_op { - { impl Add for $scalar:ty } => { - impl_op! { @binary $scalar, Add::add, simd_add } - }; - { impl Sub for $scalar:ty } => { - impl_op! { @binary $scalar, Sub::sub, simd_sub } - }; - { impl Mul for $scalar:ty } => { - impl_op! { @binary $scalar, Mul::mul, simd_mul } - }; - { impl Div for $scalar:ty } => { - impl_op! { @binary $scalar, Div::div, simd_div } - }; - { impl Rem for $scalar:ty } => { - impl_op! { @binary $scalar, Rem::rem, simd_rem } - }; - { impl Shl for $scalar:ty } => { - impl_op! { @binary $scalar, Shl::shl, simd_shl } - }; - { impl Shr for $scalar:ty } => { - impl_op! { @binary $scalar, Shr::shr, simd_shr } - }; - { impl BitAnd for $scalar:ty } => { - impl_op! { @binary $scalar, BitAnd::bitand, simd_and } - }; - { impl BitOr for $scalar:ty } => { - impl_op! { @binary $scalar, BitOr::bitor, simd_or } - }; - { impl BitXor for $scalar:ty } => { - impl_op! { @binary $scalar, BitXor::bitxor, simd_xor } - }; +macro_rules! 
for_base_types { + ( T = ($($scalar:ident),*); + type Lhs = Simd; + type Rhs = Simd; + type Output = $out:ty; - // generic binary op with assignment when output is `Self` - { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $intrinsic:ident } => { - impl_ref_ops! { - impl core::ops::$trait for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; + impl $op:ident::$call:ident { + $macro_impl:ident $inner:tt + }) => { + $( + impl $op for Simd<$scalar, N> + where + $scalar: SimdElement, + LaneCount: SupportedLaneCount, + { + type Output = $out; - #[inline] - fn $trait_fn(self, rhs: Self) -> Self::Output { - unsafe { - intrinsics::$intrinsic(self, rhs) + #[inline] + #[must_use = "operator returns a new vector without mutating the inputs"] + fn $call(self, rhs: Self) -> Self::Output { + $macro_impl!(self, rhs, $inner, $scalar) } - } - } + })* + } +} + +// A "TokenTree muncher": takes a set of scalar types `T = {};` +// type parameters for the ops it implements, `Op::fn` names, +// and a macro that expands into an expr, substituting in an intrinsic. +// It passes that to for_base_types, which expands an impl for the types, +// using the expanded expr in the function, and recurses with itself. +// +// tl;dr impls a set of ops::{Traits} for a set of types +macro_rules! for_base_ops { + ( + T = $types:tt; + type Lhs = Simd; + type Rhs = Simd; + type Output = $out:ident; + impl $op:ident::$call:ident + $inner:tt + $($rest:tt)* + ) => { + for_base_types! { + T = $types; + type Lhs = Simd; + type Rhs = Simd; + type Output = $out; + impl $op::$call + $inner + } + for_base_ops! { + T = $types; + type Lhs = Simd; + type Rhs = Simd; + type Output = $out; + $($rest)* } }; + ($($done:tt)*) => { + // Done. + } } -/// Implements floating-point operators for the provided types. -macro_rules! impl_float_ops { - { $($scalar:ty),* } => { - $( - impl_op! { impl Add for $scalar } - impl_op! { impl Sub for $scalar } - impl_op! 
{ impl Mul for $scalar } - impl_op! { impl Div for $scalar } - impl_op! { impl Rem for $scalar } - )* - }; +// Integers can always accept add, mul, sub, bitand, bitor, and bitxor. +// For all of these operations, simd_* intrinsics apply wrapping logic. +for_base_ops! { + T = (i8, i16, i32, i64, isize, u8, u16, u32, u64, usize); + type Lhs = Simd; + type Rhs = Simd; + type Output = Self; + + impl Add::add { + unsafe_base { simd_add } + } + + impl Mul::mul { + unsafe_base { simd_mul } + } + + impl Sub::sub { + unsafe_base { simd_sub } + } + + impl BitAnd::bitand { + unsafe_base { simd_and } + } + + impl BitOr::bitor { + unsafe_base { simd_or } + } + + impl BitXor::bitxor { + unsafe_base { simd_xor } + } + + impl Div::div { + int_divrem_guard { + const PANIC_ZERO: &'static str = "attempt to divide by zero"; + const PANIC_OVERFLOW: &'static str = "attempt to divide with overflow"; + simd_div + } + } + + impl Rem::rem { + int_divrem_guard { + const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero"; + const PANIC_OVERFLOW: &'static str = "attempt to calculate the remainder with overflow"; + simd_rem + } + } + + // The only question is how to handle shifts >= ::BITS? + // Our current solution uses wrapping logic. + impl Shl::shl { + wrap_bitshift { simd_shl } + } + + impl Shr::shr { + wrap_bitshift { + // This automatically monomorphizes to lshr or ashr, depending, + // so it's fine to use it for both UInts and SInts. + simd_shr + } + } } -/// Implements unsigned integer operators for the provided types. -macro_rules! impl_unsigned_int_ops { - { $($scalar:ty),* } => { - $( - impl_op! { impl Add for $scalar } - impl_op! { impl Sub for $scalar } - impl_op! { impl Mul for $scalar } - impl_op! { impl BitAnd for $scalar } - impl_op! { impl BitOr for $scalar } - impl_op! { impl BitXor for $scalar } +// We don't need any special precautions here: +// Floats always accept arithmetic ops, but may become NaN. +for_base_ops! 
{ + T = (f32, f64); + type Lhs = Simd; + type Rhs = Simd; + type Output = Self; - // Integers panic on divide by 0 - impl_ref_ops! { - impl core::ops::Div for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; + impl Add::add { + unsafe_base { simd_add } + } - #[inline] - fn div(self, rhs: Self) -> Self::Output { - if rhs.as_array() - .iter() - .any(|x| *x == 0) - { - panic!("attempt to divide by zero"); - } + impl Mul::mul { + unsafe_base { simd_mul } + } - // Guards for div(MIN, -1), - // this check only applies to signed ints - if <$scalar>::MIN != 0 && self.as_array().iter() - .zip(rhs.as_array().iter()) - .any(|(x,y)| *x == <$scalar>::MIN && *y == -1 as _) { - panic!("attempt to divide with overflow"); - } - unsafe { intrinsics::simd_div(self, rhs) } - } - } - } + impl Sub::sub { + unsafe_base { simd_sub } + } - // remainder panics on zero divisor - impl_ref_ops! { - impl core::ops::Rem for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; + impl Div::div { + unsafe_base { simd_div } + } - #[inline] - fn rem(self, rhs: Self) -> Self::Output { - if rhs.as_array() - .iter() - .any(|x| *x == 0) - { - panic!("attempt to calculate the remainder with a divisor of zero"); - } - - // Guards for rem(MIN, -1) - // this branch applies the check only to signed ints - if <$scalar>::MIN != 0 && self.as_array().iter() - .zip(rhs.as_array().iter()) - .any(|(x,y)| *x == <$scalar>::MIN && *y == -1 as _) { - panic!("attempt to calculate the remainder with overflow"); - } - unsafe { intrinsics::simd_rem(self, rhs) } - } - } - } - - // shifts panic on overflow - impl_ref_ops! 
{ - impl core::ops::Shl for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn shl(self, rhs: Self) -> Self::Output { - // TODO there is probably a better way of doing this - if rhs.as_array() - .iter() - .copied() - .any(invalid_shift_rhs) - { - panic!("attempt to shift left with overflow"); - } - unsafe { intrinsics::simd_shl(self, rhs) } - } - } - } - - impl_ref_ops! { - impl core::ops::Shr for Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - type Output = Self; - - #[inline] - fn shr(self, rhs: Self) -> Self::Output { - // TODO there is probably a better way of doing this - if rhs.as_array() - .iter() - .copied() - .any(invalid_shift_rhs) - { - panic!("attempt to shift with overflow"); - } - unsafe { intrinsics::simd_shr(self, rhs) } - } - } - } - )* - }; + impl Rem::rem { + unsafe_base { simd_rem } + } } - -/// Implements unsigned integer operators for the provided types. -macro_rules! impl_signed_int_ops { - { $($scalar:ty),* } => { - impl_unsigned_int_ops! { $($scalar),* } - }; -} - -impl_unsigned_int_ops! { u8, u16, u32, u64, usize } -impl_signed_int_ops! { i8, i16, i32, i64, isize } -impl_float_ops! { f32, f64 } diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs index 09789e11492..06ccab3ec49 100644 --- a/crates/core_simd/src/round.rs +++ b/crates/core_simd/src/round.rs @@ -5,47 +5,6 @@ macro_rules! implement { { $type:ty, $int_type:ty } => { - #[cfg(feature = "std")] - impl Simd<$type, LANES> - where - LaneCount: SupportedLaneCount, - { - /// Returns the smallest integer greater than or equal to each lane. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn ceil(self) -> Self { - unsafe { intrinsics::simd_ceil(self) } - } - - /// Returns the largest integer value less than or equal to each lane. 
- #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn floor(self) -> Self { - unsafe { intrinsics::simd_floor(self) } - } - - /// Rounds to the nearest integer value. Ties round toward zero. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn round(self) -> Self { - unsafe { intrinsics::simd_round(self) } - } - - /// Returns the floating point's integer value, with its fractional part removed. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn trunc(self) -> Self { - unsafe { intrinsics::simd_trunc(self) } - } - - /// Returns the floating point's fractional value, with its integer part removed. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn fract(self) -> Self { - self - self.trunc() - } - } - impl Simd<$type, LANES> where LaneCount: SupportedLaneCount, diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 7c5ec2bc314..b7ef7a56c73 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -75,6 +75,36 @@ where Self(array) } + /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type. + /// This follows the semantics of Rust's `as` conversion for casting + /// integers to unsigned integers (interpreting as the other type, so `-1` to `MAX`), + /// and from floats to integers (truncating, or saturating at the limits) for each lane, + /// or vice versa. 
+ /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "std")] use core_simd::Simd; + /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// let floats: Simd = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]); + /// let ints = floats.cast::(); + /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0])); + /// + /// // Formally equivalent, but `Simd::cast` can optimize better. + /// assert_eq!(ints, Simd::from_array(floats.to_array().map(|x| x as i32))); + /// + /// // The float conversion does not round-trip. + /// let floats_again = ints.cast(); + /// assert_ne!(floats, floats_again); + /// assert_eq!(floats_again, Simd::from_array([1.0, -4.0, 2147483647.0, 0.0])); + /// ``` + #[must_use] + #[inline] + #[cfg(not(bootstrap))] + pub fn cast(self) -> Simd { + unsafe { intrinsics::simd_as(self) } + } + /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. /// If an index is out-of-bounds, the lane is instead selected from the `or` vector. /// diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs index 4a4b23238c4..fcc7f6d8d1c 100644 --- a/crates/core_simd/src/vector/float.rs +++ b/crates/core_simd/src/vector/float.rs @@ -38,29 +38,6 @@ macro_rules! impl_float_vector { unsafe { intrinsics::simd_fabs(self) } } - /// Fused multiply-add. Computes `(self * a) + b` with only one rounding error, - /// yielding a more accurate result than an unfused multiply-add. - /// - /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target - /// architecture has a dedicated `fma` CPU instruction. However, this is not always - /// true, and will be heavily dependent on designing algorithms with specific target - /// hardware in mind. 
- #[cfg(feature = "std")] - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn mul_add(self, a: Self, b: Self) -> Self { - unsafe { intrinsics::simd_fma(self, a, b) } - } - - /// Produces a vector where every lane has the square root value - /// of the equivalently-indexed lane in `self` - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - #[cfg(feature = "std")] - pub fn sqrt(self) -> Self { - unsafe { intrinsics::simd_fsqrt(self) } - } - /// Takes the reciprocal (inverse) of each lane, `1/x`. #[inline] #[must_use = "method returns a new vector and does not mutate the original value"] @@ -128,8 +105,8 @@ macro_rules! impl_float_vector { self.abs().lanes_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(<$type>::INFINITY).to_bits()).lanes_eq(Simd::splat(0)) } - /// Returns true for each lane if its value is neither neither zero, infinite, - /// subnormal, or `NaN`. + /// Returns true for each lane if its value is neither zero, infinite, + /// subnormal, nor `NaN`. #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn is_normal(self) -> Mask<$mask_ty, LANES> { @@ -164,11 +141,7 @@ macro_rules! impl_float_vector { #[inline] #[must_use = "method returns a new vector and does not mutate the original value"] pub fn min(self, other: Self) -> Self { - // TODO consider using an intrinsic - self.is_nan().select( - other, - self.lanes_ge(other).select(other, self) - ) + unsafe { intrinsics::simd_fmin(self, other) } } /// Returns the maximum of each lane. @@ -177,11 +150,7 @@ macro_rules! 
impl_float_vector { #[inline] #[must_use = "method returns a new vector and does not mutate the original value"] pub fn max(self, other: Self) -> Self { - // TODO consider using an intrinsic - self.is_nan().select( - other, - self.lanes_le(other).select(other, self) - ) + unsafe { intrinsics::simd_fmax(self, other) } } /// Restrict each lane to a certain interval unless it is NaN. diff --git a/crates/core_simd/tests/cast.rs b/crates/core_simd/tests/cast.rs new file mode 100644 index 00000000000..ab5650f0713 --- /dev/null +++ b/crates/core_simd/tests/cast.rs @@ -0,0 +1,37 @@ +#![feature(portable_simd)] +macro_rules! cast_types { + ($start:ident, $($target:ident),*) => { + mod $start { + use core_simd::simd::Simd; + type Vector = Simd<$start, N>; + $( + mod $target { + use super::*; + test_helpers::test_lanes! { + fn cast_as() { + test_helpers::test_unary_elementwise( + &Vector::::cast::<$target>, + &|x| x as $target, + &|_| true, + ) + } + } + } + )* + } + }; +} + +// The hypothesis is that widening conversions aren't terribly interesting. +cast_types!(f32, f64, i8, u8, usize, isize); +cast_types!(f64, f32, i8, u8, usize, isize); +cast_types!(i8, u8, f32); +cast_types!(u8, i8, f32); +cast_types!(i16, u16, i8, u8, f32); +cast_types!(u16, i16, i8, u8, f32); +cast_types!(i32, u32, i8, u8, f32, f64); +cast_types!(u32, i32, i8, u8, f32, f64); +cast_types!(i64, u64, i8, u8, isize, usize, f32, f64); +cast_types!(u64, i64, i8, u8, isize, usize, f32, f64); +cast_types!(isize, usize, i8, u8, f32, f64); +cast_types!(usize, isize, i8, u8, f32, f64); diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs index 43ddde4c55e..4fb9de198ee 100644 --- a/crates/core_simd/tests/ops_macros.rs +++ b/crates/core_simd/tests/ops_macros.rs @@ -546,6 +546,8 @@ macro_rules! impl_float_tests { #[cfg(feature = "std")] mod std { + use std_float::StdFloat; + use super::*; test_helpers::test_lanes! 
{ fn sqrt() { diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs index 11d617a6c2c..1a1bc9ebca7 100644 --- a/crates/core_simd/tests/round.rs +++ b/crates/core_simd/tests/round.rs @@ -3,6 +3,8 @@ macro_rules! float_rounding_test { { $scalar:tt, $int_scalar:tt } => { mod $scalar { + use std_float::StdFloat; + type Vector = core_simd::Simd<$scalar, LANES>; type Scalar = $scalar; type IntScalar = $int_scalar; diff --git a/crates/std_float/Cargo.toml b/crates/std_float/Cargo.toml new file mode 100644 index 00000000000..82f66b8dcb7 --- /dev/null +++ b/crates/std_float/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "std_float" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +core_simd = { path = "../core_simd" } + +[features] +default = ["as_crate"] +as_crate = [] diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs new file mode 100644 index 00000000000..4bd4d4c05e3 --- /dev/null +++ b/crates/std_float/src/lib.rs @@ -0,0 +1,165 @@ +#![cfg_attr(feature = "as_crate", no_std)] // We are std! 
+#![cfg_attr( + feature = "as_crate", + feature(platform_intrinsics), + feature(portable_simd) +)] +#[cfg(not(feature = "as_crate"))] +use core::simd; +#[cfg(feature = "as_crate")] +use core_simd::simd; + +use simd::{LaneCount, Simd, SupportedLaneCount}; + +#[cfg(feature = "as_crate")] +mod experimental { + pub trait Sealed {} +} + +#[cfg(feature = "as_crate")] +use experimental as sealed; + +use crate::sealed::Sealed; + +// "platform intrinsics" are essentially "codegen intrinsics" +// each of these may be scalarized and lowered to a libm call +extern "platform-intrinsic" { + // ceil + fn simd_ceil(x: T) -> T; + + // floor + fn simd_floor(x: T) -> T; + + // round + fn simd_round(x: T) -> T; + + // trunc + fn simd_trunc(x: T) -> T; + + // fsqrt + fn simd_fsqrt(x: T) -> T; + + // fma + fn simd_fma(x: T, y: T, z: T) -> T; +} + +/// This trait provides a possibly-temporary implementation of float functions +/// that may, in the absence of hardware support, canonicalize to calling an +/// operating system's `math.h` dynamically-loaded library (also known as a +/// shared object). As these conditionally require runtime support, they +/// should only appear in binaries built assuming OS support: `std`. +/// +/// However, there is no reason SIMD types, in general, need OS support, +/// as for many architectures an embedded binary may simply configure that +/// support itself. This means these types must be visible in `core` +/// but have these functions available in `std`. +/// +/// [`f32`] and [`f64`] achieve a similar trick by using "lang items", but +/// due to compiler limitations, it is harder to implement this approach for +/// abstract data types like [`Simd`]. From that need, this trait is born. +/// +/// It is possible this trait will be replaced in some manner in the future, +/// when either the compiler or its supporting runtime functions are improved. 
+/// For now this trait is available to permit experimentation with SIMD float +/// operations that may lack hardware support, such as `mul_add`. +pub trait StdFloat: Sealed + Sized { + /// Fused multiply-add. Computes `(self * a) + b` with only one rounding error, + /// yielding a more accurate result than an unfused multiply-add. + /// + /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target + /// architecture has a dedicated `fma` CPU instruction. However, this is not always + /// true, and will be heavily dependent on designing algorithms with specific target + /// hardware in mind. + #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] + fn mul_add(self, a: Self, b: Self) -> Self { + unsafe { simd_fma(self, a, b) } + } + + /// Produces a vector where every lane has the square root value + /// of the equivalently-indexed lane in `self` + #[inline] + #[must_use = "method returns a new vector and does not mutate the original value"] + fn sqrt(self) -> Self { + unsafe { simd_fsqrt(self) } + } + + /// Returns the smallest integer greater than or equal to each lane. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + fn ceil(self) -> Self { + unsafe { simd_ceil(self) } + } + + /// Returns the largest integer value less than or equal to each lane. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + fn floor(self) -> Self { + unsafe { simd_floor(self) } + } + + /// Rounds to the nearest integer value. Ties round toward zero. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + fn round(self) -> Self { + unsafe { simd_round(self) } + } + + /// Returns the floating point's integer value, with its fractional part removed. 
+ #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + fn trunc(self) -> Self { + unsafe { simd_trunc(self) } + } + + /// Returns the floating point's fractional value, with its integer part removed. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn fract(self) -> Self; +} + +impl Sealed for Simd where LaneCount: SupportedLaneCount {} +impl Sealed for Simd where LaneCount: SupportedLaneCount {} + +// We can safely just use all the defaults. +impl StdFloat for Simd +where + LaneCount: SupportedLaneCount, +{ + /// Returns the floating point's fractional value, with its integer part removed. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + fn fract(self) -> Self { + self - self.trunc() + } +} + +impl StdFloat for Simd +where + LaneCount: SupportedLaneCount, +{ + /// Returns the floating point's fractional value, with its integer part removed. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + fn fract(self) -> Self { + self - self.trunc() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use simd::*; + + #[test] + fn everything_works() { + let x = f32x4::from_array([0.1, 0.5, 0.6, -1.5]); + let x2 = x + x; + let _xc = x.ceil(); + let _xf = x.floor(); + let _xr = x.round(); + let _xt = x.trunc(); + let _xfma = x.mul_add(x, x); + let _xsqrt = x.sqrt(); + let _ = x2.abs() * x2; + } +} From 8adbb998401bd0ef2e700108aaac4e9ce4b89f6b Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Wed, 23 Feb 2022 08:06:22 -0500 Subject: [PATCH 04/35] Switch bootstrap cfgs --- crates/core_simd/src/intrinsics.rs | 1 - crates/core_simd/src/vector.rs | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 233657202f7..2291400537c 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs 
@@ -41,7 +41,6 @@ extern "platform-intrinsic" { pub(crate) fn simd_cast(x: T) -> U; /// follows Rust's `T as U` semantics, including saturating float casts /// which amounts to the same as `simd_cast` for many cases - #[cfg(not(bootstrap))] pub(crate) fn simd_as(x: T) -> U; /// neg/fneg diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index b7ef7a56c73..35c5b6b84f8 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -62,10 +62,7 @@ where /// `from_slice` will panic if the slice's `len` is less than the vector's `Simd::LANES`. #[must_use] pub const fn from_slice(slice: &[T]) -> Self { - assert!( - slice.len() >= LANES, - "slice length must be at least the number of lanes" - ); + assert!(slice.len() >= LANES, "slice length must be at least the number of lanes"); let mut array = [slice[0]; LANES]; let mut i = 0; while i < LANES { @@ -100,7 +97,6 @@ where /// ``` #[must_use] #[inline] - #[cfg(not(bootstrap))] pub fn cast(self) -> Simd { unsafe { intrinsics::simd_as(self) } } From 754e077e3262bcc5d637778e6773c72e74a9a7f2 Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Mon, 28 Feb 2022 10:17:40 -0800 Subject: [PATCH 05/35] Sync rust-lang/portable-simd@5f49d4c8435a25d804b2f375e949cb25479f5be9 --- crates/core_simd/examples/spectral_norm.rs | 77 +++++++++++++++++++ crates/core_simd/src/comparisons.rs | 12 +++ crates/core_simd/src/intrinsics.rs | 68 ++++++++++++++--- crates/core_simd/src/lib.rs | 2 + crates/core_simd/src/masks.rs | 31 ++++---- crates/core_simd/src/masks/bitmask.rs | 23 +++--- crates/core_simd/src/masks/full_masks.rs | 74 +++++++++++------- crates/core_simd/src/masks/to_bitmask.rs | 57 ++++++++++++++ crates/core_simd/src/math.rs | 4 + crates/core_simd/src/ops.rs | 39 ++++++---- crates/core_simd/src/reduction.rs | 8 ++ crates/core_simd/src/round.rs | 27 ++++--- crates/core_simd/src/select.rs | 4 + crates/core_simd/src/swizzle.rs | 2 + crates/core_simd/src/to_bytes.rs | 2 + 
crates/core_simd/src/vector.rs | 87 ++++++++++++++++++++-- crates/core_simd/src/vector/ptr.rs | 4 + crates/core_simd/src/vendor.rs | 2 + crates/core_simd/tests/masks.rs | 4 +- crates/core_simd/tests/ops_macros.rs | 24 +++--- crates/core_simd/tests/round.rs | 14 +--- 21 files changed, 440 insertions(+), 125 deletions(-) create mode 100644 crates/core_simd/examples/spectral_norm.rs create mode 100644 crates/core_simd/src/masks/to_bitmask.rs diff --git a/crates/core_simd/examples/spectral_norm.rs b/crates/core_simd/examples/spectral_norm.rs new file mode 100644 index 00000000000..c515dad4dea --- /dev/null +++ b/crates/core_simd/examples/spectral_norm.rs @@ -0,0 +1,77 @@ +#![feature(portable_simd)] + +use core_simd::simd::*; + +fn a(i: usize, j: usize) -> f64 { + ((i + j) * (i + j + 1) / 2 + i + 1) as f64 +} + +fn mult_av(v: &[f64], out: &mut [f64]) { + assert!(v.len() == out.len()); + assert!(v.len() % 2 == 0); + + for (i, out) in out.iter_mut().enumerate() { + let mut sum = f64x2::splat(0.0); + + let mut j = 0; + while j < v.len() { + let b = f64x2::from_slice(&v[j..]); + let a = f64x2::from_array([a(i, j), a(i, j + 1)]); + sum += b / a; + j += 2 + } + *out = sum.horizontal_sum(); + } +} + +fn mult_atv(v: &[f64], out: &mut [f64]) { + assert!(v.len() == out.len()); + assert!(v.len() % 2 == 0); + + for (i, out) in out.iter_mut().enumerate() { + let mut sum = f64x2::splat(0.0); + + let mut j = 0; + while j < v.len() { + let b = f64x2::from_slice(&v[j..]); + let a = f64x2::from_array([a(j, i), a(j + 1, i)]); + sum += b / a; + j += 2 + } + *out = sum.horizontal_sum(); + } +} + +fn mult_atav(v: &[f64], out: &mut [f64], tmp: &mut [f64]) { + mult_av(v, tmp); + mult_atv(tmp, out); +} + +pub fn spectral_norm(n: usize) -> f64 { + assert!(n % 2 == 0, "only even lengths are accepted"); + + let mut u = vec![1.0; n]; + let mut v = u.clone(); + let mut tmp = u.clone(); + + for _ in 0..10 { + mult_atav(&u, &mut v, &mut tmp); + mult_atav(&v, &mut u, &mut tmp); + } + (dot(&u, &v) / 
dot(&v, &v)).sqrt() +} + +fn dot(x: &[f64], y: &[f64]) -> f64 { + // This is auto-vectorized: + x.iter().zip(y).map(|(&x, &y)| x * y).sum() +} + +#[cfg(test)] +#[test] +fn test() { + assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991"); +} + +fn main() { + // Empty main to make cargo happy +} diff --git a/crates/core_simd/src/comparisons.rs b/crates/core_simd/src/comparisons.rs index edef5af3687..d024cf4ddbe 100644 --- a/crates/core_simd/src/comparisons.rs +++ b/crates/core_simd/src/comparisons.rs @@ -10,6 +10,8 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_eq(self, other: Self) -> Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) } } @@ -17,6 +19,8 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_ne(self, other: Self) -> Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) } } } @@ -30,6 +34,8 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_lt(self, other: Self) -> Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) } } @@ -37,6 +43,8 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_gt(self, other: Self) -> Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. 
unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) } } @@ -44,6 +52,8 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_le(self, other: Self) -> Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) } } @@ -51,6 +61,8 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub fn lanes_ge(self, other: Self) -> Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) } } } diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 233657202f7..e150946c705 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -2,31 +2,55 @@ //! crate. //! //! The LLVM assembly language is documented here: +//! +//! A quick glossary of jargon that may appear in this module, mostly paraphrasing LLVM's LangRef: +//! - poison: "undefined behavior as a value". specifically, it is like uninit memory (such as padding bytes). it is "safe" to create poison, BUT +//! poison MUST NOT be observed from safe code, as operations on poison return poison, like NaN. unlike NaN, which has defined comparisons, +//! poison is neither true nor false, and LLVM may also convert it to undef (at which point it is both). so, it can't be conditioned on, either. +//! - undef: "a value that is every value". functionally like poison, insofar as Rust is concerned. poison may become this. note: +//! this means that division by poison or undef is like division by zero, which means it inflicts... +//! - "UB": poison and undef cover most of what people call "UB". "UB" means this operation immediately invalidates the program: +//! 
LLVM is allowed to lower it to `ud2` or other opcodes that may cause an illegal instruction exception, and this is the "good end". +//! The "bad end" is that LLVM may reverse time to the moment control flow diverged on a path towards undefined behavior, +//! and destroy the other branch, potentially deleting safe code and violating Rust's `unsafe` contract. +//! +//! Note that according to LLVM, vectors are not arrays, but they are equivalent when stored to and loaded from memory. +//! +//! Unless stated otherwise, all intrinsics for binary operations require SIMD vectors of equal types and lengths. /// These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are -/// simply lowered to the matching LLVM instructions by the compiler. The associated instruction -/// is documented alongside each intrinsic. +/// mostly lowered to the matching LLVM instructions by the compiler in a fairly straightforward manner. +/// The associated LLVM instruction or intrinsic is documented alongside each Rust intrinsic function. extern "platform-intrinsic" { /// add/fadd pub(crate) fn simd_add(x: T, y: T) -> T; /// sub/fsub - pub(crate) fn simd_sub(x: T, y: T) -> T; + pub(crate) fn simd_sub(lhs: T, rhs: T) -> T; /// mul/fmul pub(crate) fn simd_mul(x: T, y: T) -> T; /// udiv/sdiv/fdiv - pub(crate) fn simd_div(x: T, y: T) -> T; + /// ints and uints: {s,u}div incur UB if division by zero occurs. + /// ints: sdiv is UB for int::MIN / -1. + /// floats: fdiv is never UB, but may create NaNs or infinities. + pub(crate) fn simd_div(lhs: T, rhs: T) -> T; /// urem/srem/frem - pub(crate) fn simd_rem(x: T, y: T) -> T; + /// ints and uints: {s,u}rem incur UB if division by zero occurs. + /// ints: srem is UB for int::MIN / -1. + /// floats: frem is equivalent to libm::fmod in the "default" floating point environment, sans errno. + pub(crate) fn simd_rem(lhs: T, rhs: T) -> T; /// shl - pub(crate) fn simd_shl(x: T, y: T) -> T; + /// for (u)ints. 
poison if rhs >= lhs::BITS + pub(crate) fn simd_shl(lhs: T, rhs: T) -> T; - /// lshr/ashr - pub(crate) fn simd_shr(x: T, y: T) -> T; + /// ints: ashr + /// uints: lshr + /// poison if rhs >= lhs::BITS + pub(crate) fn simd_shr(lhs: T, rhs: T) -> T; /// and pub(crate) fn simd_and(x: T, y: T) -> T; @@ -38,6 +62,9 @@ extern "platform-intrinsic" { pub(crate) fn simd_xor(x: T, y: T) -> T; /// fptoui/fptosi/uitofp/sitofp + /// casting floats to integers is truncating, so it is safe to convert values like e.g. 1.5 + /// but the truncated value must fit in the target type or the result is poison. + /// use `simd_as` instead for a cast that performs a saturating conversion. pub(crate) fn simd_cast(x: T) -> U; /// follows Rust's `T as U` semantics, including saturating float casts /// which amounts to the same as `simd_cast` for many cases @@ -45,6 +72,9 @@ extern "platform-intrinsic" { pub(crate) fn simd_as(x: T) -> U; /// neg/fneg + /// ints: ultimately becomes a call to cg_ssa's BuilderMethods::neg. cg_llvm equates this to `simd_sub(Simd::splat(0), x)`. + /// floats: LLVM's fneg, which changes the floating point sign bit. Some arches have instructions for it. + /// Rust panics for Neg::neg(int::MIN) due to overflow, but it is not UB in LLVM without `nsw`. 
pub(crate) fn simd_neg(x: T) -> T; /// fabs @@ -54,6 +84,7 @@ extern "platform-intrinsic" { pub(crate) fn simd_fmin(x: T, y: T) -> T; pub(crate) fn simd_fmax(x: T, y: T) -> T; + // these return Simd with the same BITS size as the inputs pub(crate) fn simd_eq(x: T, y: T) -> U; pub(crate) fn simd_ne(x: T, y: T) -> U; pub(crate) fn simd_lt(x: T, y: T) -> U; @@ -62,19 +93,31 @@ extern "platform-intrinsic" { pub(crate) fn simd_ge(x: T, y: T) -> U; // shufflevector + // idx: LLVM calls it a "shuffle mask vector constant", a vector of i32s pub(crate) fn simd_shuffle(x: T, y: T, idx: U) -> V; + /// llvm.masked.gather + /// like a loop of pointer reads + /// val: vector of values to select if a lane is masked + /// ptr: vector of pointers to read from + /// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val) + /// note, the LLVM intrinsic accepts a mask vector of + /// FIXME: review this if/when we fix up our mask story in general? pub(crate) fn simd_gather(val: T, ptr: U, mask: V) -> T; + /// llvm.masked.scatter + /// like gather, but more spicy, as it writes instead of reads pub(crate) fn simd_scatter(val: T, ptr: U, mask: V); // {s,u}add.sat pub(crate) fn simd_saturating_add(x: T, y: T) -> T; // {s,u}sub.sat - pub(crate) fn simd_saturating_sub(x: T, y: T) -> T; + pub(crate) fn simd_saturating_sub(lhs: T, rhs: T) -> T; // reductions + // llvm.vector.reduce.{add,fadd} pub(crate) fn simd_reduce_add_ordered(x: T, y: U) -> U; + // llvm.vector.reduce.{mul,fmul} pub(crate) fn simd_reduce_mul_ordered(x: T, y: U) -> U; #[allow(unused)] pub(crate) fn simd_reduce_all(x: T) -> bool; @@ -91,7 +134,10 @@ extern "platform-intrinsic" { pub(crate) fn simd_bitmask(x: T) -> U; // select - pub(crate) fn simd_select(m: M, a: T, b: T) -> T; + // first argument is a vector of integers, -1 (all bits 1) is "true" + // logically equivalent to (yes & m) | (no & (m^-1), + // but you can use it on floats. 
+ pub(crate) fn simd_select(m: M, yes: T, no: T) -> T; #[allow(unused)] - pub(crate) fn simd_select_bitmask(m: M, a: T, b: T) -> T; + pub(crate) fn simd_select_bitmask(m: M, yes: T, no: T) -> T; } diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs index 960a6640083..91ae34c05e0 100644 --- a/crates/core_simd/src/lib.rs +++ b/crates/core_simd/src/lib.rs @@ -1,7 +1,9 @@ #![cfg_attr(not(feature = "std"), no_std)] #![feature( const_fn_trait_bound, + convert_float_to_int, decl_macro, + intra_doc_pointers, platform_intrinsics, repr_simd, simd_ffi, diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index ae1fef53da8..e1cd7930450 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -12,8 +12,10 @@ )] mod mask_impl; -use crate::simd::intrinsics; -use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; +mod to_bitmask; +pub use to_bitmask::ToBitMask; + +use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SupportedLaneCount}; use core::cmp::Ordering; use core::{fmt, mem}; @@ -42,6 +44,9 @@ mod sealed { use sealed::Sealed; /// Marker trait for types that may be used as SIMD mask elements. +/// +/// # Safety +/// Type must be a signed integer. pub unsafe trait MaskElement: SimdElement + Sealed {} macro_rules! 
impl_element { @@ -149,6 +154,7 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] pub unsafe fn from_int_unchecked(value: Simd) -> Self { + // Safety: the caller must confirm this invariant unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) } } @@ -161,6 +167,7 @@ where #[must_use = "method returns a new mask and does not mutate the original value"] pub fn from_int(value: Simd) -> Self { assert!(T::valid(value), "all values must be either 0 or -1",); + // Safety: the validity has been checked unsafe { Self::from_int_unchecked(value) } } @@ -179,6 +186,7 @@ where #[inline] #[must_use = "method returns a new bool and does not mutate the original value"] pub unsafe fn test_unchecked(&self, lane: usize) -> bool { + // Safety: the caller must confirm this invariant unsafe { self.0.test_unchecked(lane) } } @@ -190,6 +198,7 @@ where #[must_use = "method returns a new bool and does not mutate the original value"] pub fn test(&self, lane: usize) -> bool { assert!(lane < LANES, "lane index out of range"); + // Safety: the lane index has been checked unsafe { self.test_unchecked(lane) } } @@ -199,6 +208,7 @@ where /// `lane` must be less than `LANES`. #[inline] pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) { + // Safety: the caller must confirm this invariant unsafe { self.0.set_unchecked(lane, value); } @@ -211,27 +221,12 @@ where #[inline] pub fn set(&mut self, lane: usize, value: bool) { assert!(lane < LANES, "lane index out of range"); + // Safety: the lane index has been checked unsafe { self.set_unchecked(lane, value); } } - /// Convert this mask to a bitmask, with one bit set per lane. - #[cfg(feature = "generic_const_exprs")] - #[inline] - #[must_use = "method returns a new array and does not mutate the original value"] - pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { - self.0.to_bitmask() - } - - /// Convert a bitmask to a mask. 
- #[cfg(feature = "generic_const_exprs")] - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn from_bitmask(bitmask: [u8; LaneCount::::BITMASK_LEN]) -> Self { - Self(mask_impl::Mask::from_bitmask(bitmask)) - } - /// Returns true if any lane is set, or false otherwise. #[inline] #[must_use = "method returns a new bool and does not mutate the original value"] diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs index b4217dc87ba..ec4dd357ee9 100644 --- a/crates/core_simd/src/masks/bitmask.rs +++ b/crates/core_simd/src/masks/bitmask.rs @@ -1,7 +1,7 @@ #![allow(unused_imports)] use super::MaskElement; use crate::simd::intrinsics; -use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask}; use core::marker::PhantomData; /// A mask where each lane is represented by a single bit. @@ -115,20 +115,22 @@ where unsafe { Self(intrinsics::simd_bitmask(value), PhantomData) } } - #[cfg(feature = "generic_const_exprs")] #[inline] - #[must_use = "method returns a new array and does not mutate the original value"] - pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { - // Safety: these are the same type and we are laundering the generic + pub fn to_bitmask_integer(self) -> U + where + super::Mask: ToBitMask, + { + // Safety: these are the same types unsafe { core::mem::transmute_copy(&self.0) } } - #[cfg(feature = "generic_const_exprs")] #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn from_bitmask(bitmask: [u8; LaneCount::::BITMASK_LEN]) -> Self { - // Safety: these are the same type and we are laundering the generic - Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData) + pub fn from_bitmask_integer(bitmask: U) -> Self + where + super::Mask: ToBitMask, + { + // Safety: these are the same types + unsafe { Self(core::mem::transmute_copy(&bitmask), 
PhantomData) } } #[inline] @@ -137,6 +139,7 @@ where where U: MaskElement, { + // Safety: bitmask layout does not depend on the element width unsafe { core::mem::transmute_copy(&self) } } diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs index e5bb784bb91..8bbdf637de8 100644 --- a/crates/core_simd/src/masks/full_masks.rs +++ b/crates/core_simd/src/masks/full_masks.rs @@ -2,7 +2,7 @@ use super::MaskElement; use crate::simd::intrinsics; -use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask}; #[repr(transparent)] pub struct Mask(Simd) @@ -66,6 +66,23 @@ where } } +// Used for bitmask bit order workaround +pub(crate) trait ReverseBits { + fn reverse_bits(self) -> Self; +} + +macro_rules! impl_reverse_bits { + { $($int:ty),* } => { + $( + impl ReverseBits for $int { + fn reverse_bits(self) -> Self { <$int>::reverse_bits(self) } + } + )* + } +} + +impl_reverse_bits! { u8, u16, u32, u64 } + impl Mask where T: MaskElement, @@ -106,44 +123,40 @@ where where U: MaskElement, { + // Safety: masks are simply integer vectors of 0 and -1, and we can cast the element type. unsafe { Mask(intrinsics::simd_cast(self.0)) } } - #[cfg(feature = "generic_const_exprs")] #[inline] - #[must_use = "method returns a new array and does not mutate the original value"] - pub fn to_bitmask(self) -> [u8; LaneCount::::BITMASK_LEN] { - unsafe { - let mut bitmask: [u8; LaneCount::::BITMASK_LEN] = - intrinsics::simd_bitmask(self.0); - - // There is a bug where LLVM appears to implement this operation with the wrong - // bit order. 
- // TODO fix this in a better way - if cfg!(target_endian = "big") { - for x in bitmask.as_mut() { - *x = x.reverse_bits(); - } - } + pub(crate) fn to_bitmask_integer(self) -> U + where + super::Mask: ToBitMask, + { + // Safety: U is required to be the appropriate bitmask type + let bitmask: U = unsafe { intrinsics::simd_bitmask(self.0) }; + // LLVM assumes bit order should match endianness + if cfg!(target_endian = "big") { + bitmask.reverse_bits() + } else { bitmask } } - #[cfg(feature = "generic_const_exprs")] #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn from_bitmask(mut bitmask: [u8; LaneCount::::BITMASK_LEN]) -> Self { - unsafe { - // There is a bug where LLVM appears to implement this operation with the wrong - // bit order. - // TODO fix this in a better way - if cfg!(target_endian = "big") { - for x in bitmask.as_mut() { - *x = x.reverse_bits(); - } - } + pub(crate) fn from_bitmask_integer(bitmask: U) -> Self + where + super::Mask: ToBitMask, + { + // LLVM assumes bit order should match endianness + let bitmask = if cfg!(target_endian = "big") { + bitmask.reverse_bits() + } else { + bitmask + }; + // Safety: U is required to be the appropriate bitmask type + unsafe { Self::from_int_unchecked(intrinsics::simd_select_bitmask( bitmask, Self::splat(true).to_int(), @@ -155,12 +168,14 @@ where #[inline] #[must_use = "method returns a new bool and does not mutate the original value"] pub fn any(self) -> bool { + // Safety: use `self` as an integer vector unsafe { intrinsics::simd_reduce_any(self.to_int()) } } #[inline] #[must_use = "method returns a new vector and does not mutate the original value"] pub fn all(self) -> bool { + // Safety: use `self` as an integer vector unsafe { intrinsics::simd_reduce_all(self.to_int()) } } } @@ -184,6 +199,7 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] fn bitand(self, rhs: Self) -> Self { + // Safety: `self` is an 
integer vector unsafe { Self(intrinsics::simd_and(self.0, rhs.0)) } } } @@ -197,6 +213,7 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] fn bitor(self, rhs: Self) -> Self { + // Safety: `self` is an integer vector unsafe { Self(intrinsics::simd_or(self.0, rhs.0)) } } } @@ -210,6 +227,7 @@ where #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] fn bitxor(self, rhs: Self) -> Self { + // Safety: `self` is an integer vector unsafe { Self(intrinsics::simd_xor(self.0, rhs.0)) } } } diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs new file mode 100644 index 00000000000..1c2037764c1 --- /dev/null +++ b/crates/core_simd/src/masks/to_bitmask.rs @@ -0,0 +1,57 @@ +use super::{mask_impl, Mask, MaskElement}; +use crate::simd::{LaneCount, SupportedLaneCount}; + +mod sealed { + pub trait Sealed {} +} +pub use sealed::Sealed; + +impl Sealed for Mask +where + T: MaskElement, + LaneCount: SupportedLaneCount, +{ +} + +/// Converts masks to and from integer bitmasks. +/// +/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB. +/// +/// # Safety +/// This trait is `unsafe` and sealed, since the `BitMask` type must match the number of lanes in +/// the mask. +pub unsafe trait ToBitMask: Sealed { + /// The integer bitmask type. + type BitMask; + + /// Converts a mask to a bitmask. + fn to_bitmask(self) -> Self::BitMask; + + /// Converts a bitmask to a mask. + fn from_bitmask(bitmask: Self::BitMask) -> Self; +} + +macro_rules! impl_integer_intrinsic { + { $(unsafe impl ToBitMask for Mask<_, $lanes:literal>)* } => { + $( + unsafe impl ToBitMask for Mask { + type BitMask = $int; + + fn to_bitmask(self) -> $int { + self.0.to_bitmask_integer() + } + + fn from_bitmask(bitmask: $int) -> Self { + Self(mask_impl::Mask::from_bitmask_integer(bitmask)) + } + } + )* + } +} + +impl_integer_intrinsic! 
{ + unsafe impl ToBitMask for Mask<_, 8> + unsafe impl ToBitMask for Mask<_, 16> + unsafe impl ToBitMask for Mask<_, 32> + unsafe impl ToBitMask for Mask<_, 64> +} diff --git a/crates/core_simd/src/math.rs b/crates/core_simd/src/math.rs index 7435b6df918..0b4e40983af 100644 --- a/crates/core_simd/src/math.rs +++ b/crates/core_simd/src/math.rs @@ -22,6 +22,7 @@ macro_rules! impl_uint_arith { /// ``` #[inline] pub fn saturating_add(self, second: Self) -> Self { + // Safety: `self` is a vector unsafe { simd_saturating_add(self, second) } } @@ -41,6 +42,7 @@ macro_rules! impl_uint_arith { /// assert_eq!(sat, Simd::splat(0)); #[inline] pub fn saturating_sub(self, second: Self) -> Self { + // Safety: `self` is a vector unsafe { simd_saturating_sub(self, second) } } })+ @@ -68,6 +70,7 @@ macro_rules! impl_int_arith { /// ``` #[inline] pub fn saturating_add(self, second: Self) -> Self { + // Safety: `self` is a vector unsafe { simd_saturating_add(self, second) } } @@ -87,6 +90,7 @@ macro_rules! impl_int_arith { /// assert_eq!(sat, Simd::from_array([MIN, MIN, MIN, 0])); #[inline] pub fn saturating_sub(self, second: Self) -> Self { + // Safety: `self` is a vector unsafe { simd_saturating_sub(self, second) } } diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs index b65038933bf..1b35b3e717a 100644 --- a/crates/core_simd/src/ops.rs +++ b/crates/core_simd/src/ops.rs @@ -57,29 +57,40 @@ macro_rules! wrap_bitshift { }; } -// Division by zero is poison, according to LLVM. -// So is dividing the MIN value of a signed integer by -1, -// since that would return MAX + 1. -// FIXME: Rust allows ::MIN / -1, -// so we should probably figure out how to make that safe. +/// SAFETY: This macro must only be used to impl Div or Rem and given the matching intrinsic. +/// It guards against LLVM's UB conditions for integer div or rem using masks and selects, +/// thus guaranteeing a Rust value returns instead. 
+/// +/// | | LLVM | Rust +/// | :--------------: | :--- | :---------- +/// | N {/,%} 0 | UB | panic!() +/// | <$int>::MIN / -1 | UB | <$int>::MIN +/// | <$int>::MIN % -1 | UB | 0 +/// macro_rules! int_divrem_guard { ( $lhs:ident, $rhs:ident, { const PANIC_ZERO: &'static str = $zero:literal; - const PANIC_OVERFLOW: &'static str = $overflow:literal; $simd_call:ident }, $int:ident ) => { if $rhs.lanes_eq(Simd::splat(0)).any() { panic!($zero); - } else if <$int>::MIN != 0 - && ($lhs.lanes_eq(Simd::splat(<$int>::MIN)) - // type inference can break here, so cut an SInt to size - & $rhs.lanes_eq(Simd::splat(-1i64 as _))).any() - { - panic!($overflow); } else { - unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) } + // Prevent otherwise-UB overflow on the MIN / -1 case. + let rhs = if <$int>::MIN != 0 { + // This should, at worst, optimize to a few branchless logical ops + // Ideally, this entire conditional should evaporate + // Fire LLVM and implement those manually if it doesn't get the hint + ($lhs.lanes_eq(Simd::splat(<$int>::MIN)) + // type inference can break here, so cut an SInt to size + & $rhs.lanes_eq(Simd::splat(-1i64 as _))) + .select(Simd::splat(1), $rhs) + } else { + // Nice base case to make it easy to const-fold away the other branch. + $rhs + }; + unsafe { $crate::simd::intrinsics::$simd_call($lhs, rhs) } } }; } @@ -183,7 +194,6 @@ for_base_ops! { impl Div::div { int_divrem_guard { const PANIC_ZERO: &'static str = "attempt to divide by zero"; - const PANIC_OVERFLOW: &'static str = "attempt to divide with overflow"; simd_div } } @@ -191,7 +201,6 @@ for_base_ops! 
{ impl Rem::rem { int_divrem_guard { const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero"; - const PANIC_OVERFLOW: &'static str = "attempt to calculate the remainder with overflow"; simd_rem } } diff --git a/crates/core_simd/src/reduction.rs b/crates/core_simd/src/reduction.rs index e79a185816b..e1cd743e442 100644 --- a/crates/core_simd/src/reduction.rs +++ b/crates/core_simd/src/reduction.rs @@ -14,24 +14,28 @@ macro_rules! impl_integer_reductions { /// Horizontal wrapping add. Returns the sum of the lanes of the vector, with wrapping addition. #[inline] pub fn horizontal_sum(self) -> $scalar { + // Safety: `self` is an integer vector unsafe { simd_reduce_add_ordered(self, 0) } } /// Horizontal wrapping multiply. Returns the product of the lanes of the vector, with wrapping multiplication. #[inline] pub fn horizontal_product(self) -> $scalar { + // Safety: `self` is an integer vector unsafe { simd_reduce_mul_ordered(self, 1) } } /// Horizontal maximum. Returns the maximum lane in the vector. #[inline] pub fn horizontal_max(self) -> $scalar { + // Safety: `self` is an integer vector unsafe { simd_reduce_max(self) } } /// Horizontal minimum. Returns the minimum lane in the vector. #[inline] pub fn horizontal_min(self) -> $scalar { + // Safety: `self` is an integer vector unsafe { simd_reduce_min(self) } } } @@ -63,6 +67,7 @@ macro_rules! impl_float_reductions { if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { self.as_array().iter().sum() } else { + // Safety: `self` is a float vector unsafe { simd_reduce_add_ordered(self, 0.) } } } @@ -74,6 +79,7 @@ macro_rules! impl_float_reductions { if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { self.as_array().iter().product() } else { + // Safety: `self` is a float vector unsafe { simd_reduce_mul_ordered(self, 1.) } } } @@ -84,6 +90,7 @@ macro_rules! impl_float_reductions { /// return either. 
This function will not return `NaN` unless all lanes are `NaN`. #[inline] pub fn horizontal_max(self) -> $scalar { + // Safety: `self` is a float vector unsafe { simd_reduce_max(self) } } @@ -93,6 +100,7 @@ macro_rules! impl_float_reductions { /// return either. This function will not return `NaN` unless all lanes are `NaN`. #[inline] pub fn horizontal_min(self) -> $scalar { + // Safety: `self` is a float vector unsafe { simd_reduce_min(self) } } } diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs index 06ccab3ec49..556bc2cc1fe 100644 --- a/crates/core_simd/src/round.rs +++ b/crates/core_simd/src/round.rs @@ -1,9 +1,10 @@ use crate::simd::intrinsics; -use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; +use core::convert::FloatToInt; macro_rules! implement { { - $type:ty, $int_type:ty + $type:ty } => { impl Simd<$type, LANES> where @@ -18,20 +19,22 @@ macro_rules! implement { /// * Not be NaN /// * Not be infinite /// * Be representable in the return type, after truncating off its fractional part + /// + /// If these requirements are infeasible or costly, consider using the safe function [cast], + /// which saturates on conversion. + /// + /// [cast]: Simd::cast #[inline] - pub unsafe fn to_int_unchecked(self) -> Simd<$int_type, LANES> { + pub unsafe fn to_int_unchecked(self) -> Simd + where + $type: FloatToInt, + I: SimdElement, + { unsafe { intrinsics::simd_cast(self) } } - - /// Creates a floating-point vector from an integer vector. Rounds values that are - /// not exactly representable. - #[inline] - pub fn round_from_int(value: Simd<$int_type, LANES>) -> Self { - unsafe { intrinsics::simd_cast(value) } - } } } } -implement! { f32, i32 } -implement! { f64, i64 } +implement! { f32 } +implement! 
{ f64 } diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs index 8d521057fbd..3acf07260e1 100644 --- a/crates/core_simd/src/select.rs +++ b/crates/core_simd/src/select.rs @@ -11,6 +11,7 @@ where /// For each lane in the mask, choose the corresponding lane from `true_values` if /// that lane mask is true, and `false_values` if that lane mask is false. /// + /// # Examples /// ``` /// # #![feature(portable_simd)] /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask}; @@ -31,6 +32,8 @@ where where U: SimdElement, { + // Safety: The mask has been cast to a vector of integers, + // and the operands to select between are vectors of the same type and length. unsafe { intrinsics::simd_select(self.to_int(), true_values, false_values) } } @@ -39,6 +42,7 @@ where /// For each lane in the mask, choose the corresponding lane from `true_values` if /// that lane mask is true, and `false_values` if that lane mask is false. /// + /// # Examples /// ``` /// # #![feature(portable_simd)] /// # #[cfg(feature = "std")] use core_simd::Mask; diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs index bdc489774a5..08b2add1166 100644 --- a/crates/core_simd/src/swizzle.rs +++ b/crates/core_simd/src/swizzle.rs @@ -95,6 +95,7 @@ pub trait Swizzle { LaneCount: SupportedLaneCount, LaneCount: SupportedLaneCount, { + // Safety: `vector` is a vector, and `INDEX_IMPL` is a const array of u32. unsafe { intrinsics::simd_shuffle(vector, vector, Self::INDEX_IMPL) } } } @@ -119,6 +120,7 @@ pub trait Swizzle2 { LaneCount: SupportedLaneCount, LaneCount: SupportedLaneCount, { + // Safety: `first` and `second` are vectors, and `INDEX_IMPL` is a const array of u32. 
unsafe { intrinsics::simd_shuffle(first, second, Self::INDEX_IMPL) } } } diff --git a/crates/core_simd/src/to_bytes.rs b/crates/core_simd/src/to_bytes.rs index 8d9b3e8ff85..b36b1a347b2 100644 --- a/crates/core_simd/src/to_bytes.rs +++ b/crates/core_simd/src/to_bytes.rs @@ -8,12 +8,14 @@ macro_rules! impl_to_bytes { /// Return the memory representation of this integer as a byte array in native byte /// order. pub fn to_ne_bytes(self) -> crate::simd::Simd { + // Safety: transmuting between vectors is safe unsafe { core::mem::transmute_copy(&self) } } /// Create a native endian integer value from its memory representation as a byte array /// in native endianness. pub fn from_ne_bytes(bytes: crate::simd::Simd) -> Self { + // Safety: transmuting between vectors is safe unsafe { core::mem::transmute_copy(&bytes) } } } diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index b7ef7a56c73..ff1b2c756ad 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -12,7 +12,79 @@ pub(crate) mod ptr; use crate::simd::intrinsics; use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount}; -/// A SIMD vector of `LANES` elements of type `T`. +/// A SIMD vector of `LANES` elements of type `T`. `Simd` has the same shape as [`[T; N]`](array), but operates like `T`. +/// +/// Two vectors of the same type and length will, by convention, support the operators (+, *, etc.) that `T` does. +/// These take the lanes at each index on the left-hand side and right-hand side, perform the operation, +/// and return the result in the same lane in a vector of equal size. For a given operator, this is equivalent to zipping +/// the two arrays together and mapping the operator over each lane. 
+/// +/// ```rust +/// # #![feature(array_zip, portable_simd)] +/// # use core::simd::{Simd}; +/// let a0: [i32; 4] = [-2, 0, 2, 4]; +/// let a1 = [10, 9, 8, 7]; +/// let zm_add = a0.zip(a1).map(|(lhs, rhs)| lhs + rhs); +/// let zm_mul = a0.zip(a1).map(|(lhs, rhs)| lhs * rhs); +/// +/// // `Simd` implements `From<[T; N]> +/// let (v0, v1) = (Simd::from(a0), Simd::from(a1)); +/// // Which means arrays implement `Into>`. +/// assert_eq!(v0 + v1, zm_add.into()); +/// assert_eq!(v0 * v1, zm_mul.into()); +/// ``` +/// +/// `Simd` with integers has the quirk that these operations are also inherently wrapping, as if `T` was [`Wrapping`]. +/// Thus, `Simd` does not implement `wrapping_add`, because that is the default behavior. +/// This means there is no warning on overflows, even in "debug" builds. +/// For most applications where `Simd` is appropriate, it is "not a bug" to wrap, +/// and even "debug builds" are unlikely to tolerate the loss of performance. +/// You may want to consider using explicitly checked arithmetic if such is required. +/// Division by zero still causes a panic, so you may want to consider using floating point numbers if that is unacceptable. +/// +/// [`Wrapping`]: core::num::Wrapping +/// +/// # Layout +/// `Simd` has a layout similar to `[T; N]` (identical "shapes"), but with a greater alignment. +/// `[T; N]` is aligned to `T`, but `Simd` will have an alignment based on both `T` and `N`. +/// It is thus sound to [`transmute`] `Simd` to `[T; N]`, and will typically optimize to zero cost, +/// but the reverse transmutation is more likely to require a copy the compiler cannot simply elide. +/// +/// # ABI "Features" +/// Due to Rust's safety guarantees, `Simd` is currently passed to and from functions via memory, not SIMD registers, +/// except as an optimization. `#[inline]` hints are recommended on functions that accept `Simd` or return it. +/// The need for this may be corrected in the future. 
+/// +/// # Safe SIMD with Unsafe Rust +/// +/// Operations with `Simd` are typically safe, but there are many reasons to want to combine SIMD with `unsafe` code. +/// Care must be taken to respect differences between `Simd` and other types it may be transformed into or derived from. +/// In particular, the layout of `Simd` may be similar to `[T; N]`, and may allow some transmutations, +/// but references to `[T; N]` are not interchangeable with those to `Simd`. +/// Thus, when using `unsafe` Rust to read and write `Simd` through [raw pointers], it is a good idea to first try with +/// [`read_unaligned`] and [`write_unaligned`]. This is because: +/// - [`read`] and [`write`] require full alignment (in this case, `Simd`'s alignment) +/// - the likely source for reading or destination for writing `Simd` is [`[T]`](slice) and similar types, aligned to `T` +/// - combining these actions would violate the `unsafe` contract and explode the program into a puff of **undefined behavior** +/// - the compiler can implicitly adjust layouts to make unaligned reads or writes fully aligned if it sees the optimization +/// - most contemporary processors suffer no performance penalty for "unaligned" reads and writes that are aligned at runtime +/// +/// By imposing fewer obligations, unaligned functions are less likely to make the program unsound, +/// and may be just as fast as stricter alternatives. +/// When trying to guarantee alignment, [`[T]::as_simd`][as_simd] is an option for converting `[T]` to `[Simd]`, +/// and allows soundly operating on an aligned SIMD body, but it may cost more time when handling the scalar head and tail. +/// If these are not sufficient, then it is most ideal to design data structures to be already aligned +/// to the `Simd` you wish to use before using `unsafe` Rust to read or write. 
+/// More conventional ways to compensate for these facts, like materializing `Simd` to or from an array first, +/// are handled by safe methods like [`Simd::from_array`] and [`Simd::from_slice`]. +/// +/// [`transmute`]: core::mem::transmute +/// [raw pointers]: pointer +/// [`read_unaligned`]: pointer::read_unaligned +/// [`write_unaligned`]: pointer::write_unaligned +/// [`read`]: pointer::read +/// [`write`]: pointer::write +/// [as_simd]: slice::as_simd #[repr(simd)] pub struct Simd([T; LANES]) where @@ -102,6 +174,7 @@ where #[inline] #[cfg(not(bootstrap))] pub fn cast(self) -> Simd { + // Safety: The input argument is a vector of a known SIMD type. unsafe { intrinsics::simd_as(self) } } @@ -175,7 +248,7 @@ where or: Self, ) -> Self { let enable: Mask = enable & idxs.lanes_lt(Simd::splat(slice.len())); - // SAFETY: We have masked-off out-of-bounds lanes. + // Safety: We have masked-off out-of-bounds lanes. unsafe { Self::gather_select_unchecked(slice, enable, idxs, or) } } @@ -216,7 +289,7 @@ where let base_ptr = crate::simd::ptr::SimdConstPtr::splat(slice.as_ptr()); // Ferris forgive me, I have done pointer arithmetic here. let ptrs = base_ptr.wrapping_add(idxs); - // SAFETY: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah + // Safety: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah unsafe { intrinsics::simd_gather(or, ptrs, enable.to_int()) } } @@ -268,7 +341,7 @@ where idxs: Simd, ) { let enable: Mask = enable & idxs.lanes_lt(Simd::splat(slice.len())); - // SAFETY: We have masked-off out-of-bounds lanes. + // Safety: We have masked-off out-of-bounds lanes. 
unsafe { self.scatter_select_unchecked(slice, enable, idxs) } } @@ -307,7 +380,7 @@ where enable: Mask, idxs: Simd, ) { - // SAFETY: This block works with *mut T derived from &mut 'a [T], + // Safety: This block works with *mut T derived from &mut 'a [T], // which means it is delicate in Rust's borrowing model, circa 2021: // &mut 'a [T] asserts uniqueness, so deriving &'a [T] invalidates live *mut Ts! // Even though this block is largely safe methods, it must be exactly this way @@ -487,7 +560,9 @@ mod sealed { use sealed::Sealed; /// Marker trait for types that may be used as SIMD vector elements. -/// SAFETY: This trait, when implemented, asserts the compiler can monomorphize +/// +/// # Safety +/// This trait, when implemented, asserts the compiler can monomorphize /// `#[repr(simd)]` structs with the marked type as an element. /// Strictly, it is valid to impl if the vector will not be miscompiled. /// Practically, it is user-unfriendly to impl it if the vector won't compile, diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs index c668d9a6eae..417d255c28d 100644 --- a/crates/core_simd/src/vector/ptr.rs +++ b/crates/core_simd/src/vector/ptr.rs @@ -21,6 +21,8 @@ where #[inline] #[must_use] pub fn wrapping_add(self, addend: Simd) -> Self { + // Safety: converting pointers to usize and vice-versa is safe + // (even if using that pointer is not) unsafe { let x: Simd = mem::transmute_copy(&self); mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) @@ -47,6 +49,8 @@ where #[inline] #[must_use] pub fn wrapping_add(self, addend: Simd) -> Self { + // Safety: converting pointers to usize and vice-versa is safe + // (even if using that pointer is not) unsafe { let x: Simd = mem::transmute_copy(&self); mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) diff --git a/crates/core_simd/src/vendor.rs b/crates/core_simd/src/vendor.rs index e8ce7176b4f..9fb70218c95 100644 --- 
a/crates/core_simd/src/vendor.rs +++ b/crates/core_simd/src/vendor.rs @@ -9,6 +9,8 @@ macro_rules! from_transmute { impl core::convert::From<$from> for $to { #[inline] fn from(value: $from) -> $to { + // Safety: transmuting between vectors is safe, but the caller of this macro + // checks the invariants unsafe { core::mem::transmute(value) } } } diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs index 6a8ecd33a73..3aec36ca7b7 100644 --- a/crates/core_simd/tests/masks.rs +++ b/crates/core_simd/tests/masks.rs @@ -68,16 +68,16 @@ macro_rules! test_mask_api { assert_eq!(core_simd::Mask::<$type, 8>::from_int(int), mask); } - #[cfg(feature = "generic_const_exprs")] #[test] fn roundtrip_bitmask_conversion() { + use core_simd::ToBitMask; let values = [ true, false, false, true, false, false, true, false, true, true, false, false, false, false, false, true, ]; let mask = core_simd::Mask::<$type, 16>::from_array(values); let bitmask = mask.to_bitmask(); - assert_eq!(bitmask, [0b01001001, 0b10000011]); + assert_eq!(bitmask, 0b1000001101001001); assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask); } } diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs index 4fb9de198ee..50f7a4ca170 100644 --- a/crates/core_simd/tests/ops_macros.rs +++ b/crates/core_simd/tests/ops_macros.rs @@ -210,15 +210,21 @@ macro_rules! impl_signed_tests { ) } + fn div_min_may_overflow() { + let a = Vector::::splat(Scalar::MIN); + let b = Vector::::splat(-1); + assert_eq!(a / b, a); + } + + fn rem_min_may_overflow() { + let a = Vector::::splat(Scalar::MIN); + let b = Vector::::splat(-1); + assert_eq!(a % b, Vector::::splat(0)); + } + } test_helpers::test_lanes_panic! { - fn div_min_overflow_panics() { - let a = Vector::::splat(Scalar::MIN); - let b = Vector::::splat(-1); - let _ = a / b; - } - fn div_by_all_zeros_panics() { let a = Vector::::splat(42); let b = Vector::::splat(0); @@ -232,12 +238,6 @@ macro_rules! 
impl_signed_tests { let _ = a / b; } - fn rem_min_overflow_panic() { - let a = Vector::::splat(Scalar::MIN); - let b = Vector::::splat(-1); - let _ = a % b; - } - fn rem_zero_panic() { let a = Vector::::splat(42); let b = Vector::::splat(0); diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs index 1a1bc9ebca7..53732329237 100644 --- a/crates/core_simd/tests/round.rs +++ b/crates/core_simd/tests/round.rs @@ -53,14 +53,6 @@ macro_rules! float_rounding_test { } test_helpers::test_lanes! { - fn from_int() { - test_helpers::test_unary_elementwise( - &Vector::::round_from_int, - &|x| x as Scalar, - &|_| true, - ) - } - fn to_int_unchecked() { // The maximum integer that can be represented by the equivalently sized float has // all of the mantissa digits set to 1, pushed up to the MSB. @@ -72,11 +64,11 @@ macro_rules! float_rounding_test { runner.run( &test_helpers::array::UniformArrayStrategy::new(-MAX_REPRESENTABLE_VALUE..MAX_REPRESENTABLE_VALUE), |x| { - let result_1 = unsafe { Vector::from_array(x).to_int_unchecked().to_array() }; + let result_1 = unsafe { Vector::from_array(x).to_int_unchecked::().to_array() }; let result_2 = { - let mut result = [0; LANES]; + let mut result: [IntScalar; LANES] = [0; LANES]; for (i, o) in x.iter().zip(result.iter_mut()) { - *o = unsafe { i.to_int_unchecked() }; + *o = unsafe { i.to_int_unchecked::() }; } result }; From 700972b1a37737274da1a0932fa51b66a003b19c Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Wed, 2 Mar 2022 17:58:33 +0100 Subject: [PATCH 06/35] Fix unused_doc_comments lint errors --- crates/core_simd/src/intrinsics.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 4c68d11e893..cf2c0a02351 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -18,9 +18,10 @@ //! //! 
Unless stated otherwise, all intrinsics for binary operations require SIMD vectors of equal types and lengths. -/// These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are -/// mostly lowered to the matching LLVM instructions by the compiler in a fairly straightforward manner. -/// The associated LLVM instruction or intrinsic is documented alongside each Rust intrinsic function. + +// These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are +// mostly lowered to the matching LLVM instructions by the compiler in a fairly straightforward manner. +// The associated LLVM instruction or intrinsic is documented alongside each Rust intrinsic function. extern "platform-intrinsic" { /// add/fadd pub(crate) fn simd_add(x: T, y: T) -> T; From c43129f8374bc5e058efbc8c6a7f143df1d3754f Mon Sep 17 00:00:00 2001 From: T-O-R-U-S Date: Sat, 12 Feb 2022 23:16:17 +0400 Subject: [PATCH 07/35] Use implicit capture syntax in format_args This updates the standard library's documentation to use the new syntax. The documentation is worthwhile to update as it should be more idiomatic (particularly for features like this, which are nice for users to get acquainted with). The general codebase is likely more hassle than benefit to update: it'll hurt git blame, and generally updates can be done by folks updating the code if (and when) that makes things more readable with the new format. A few places in the compiler and library code are updated (mostly just due to already having been done when this commit was first authored). 
--- crates/core_simd/examples/nbody.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs index 7b1e6840f64..b16b952f71e 100644 --- a/crates/core_simd/examples/nbody.rs +++ b/crates/core_simd/examples/nbody.rs @@ -187,7 +187,7 @@ mod tests { fn main() { { let (energy_before, energy_after) = nbody::run(1000); - println!("Energy before: {}", energy_before); - println!("Energy after: {}", energy_after); + println!("Energy before: {energy_before}"); + println!("Energy after: {energy_after}"); } } From 94c7da04b4fab1636ba42551c9a9b4455e342c71 Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Sat, 12 Mar 2022 16:09:37 -0800 Subject: [PATCH 08/35] Sync portable-simd to rust-lang/portable-simd@72df4c45056a8bc0d1b3f06fdc828722177f0763 --- beginners-guide.md | 4 +- crates/core_simd/Cargo.toml | 2 +- crates/core_simd/examples/matrix_inversion.rs | 2 +- crates/core_simd/examples/nbody.rs | 8 +- crates/core_simd/examples/spectral_norm.rs | 4 +- crates/core_simd/src/comparisons.rs | 52 +++++++++++++ crates/core_simd/src/intrinsics.rs | 9 ++- crates/core_simd/src/lib.rs | 3 +- crates/core_simd/src/masks/to_bitmask.rs | 3 + crates/core_simd/src/math.rs | 21 ++---- crates/core_simd/src/reduction.rs | 44 +++++------ crates/core_simd/src/select.rs | 6 +- crates/core_simd/src/swizzle.rs | 12 +-- crates/core_simd/src/vector.rs | 24 ++---- crates/core_simd/tests/i16_ops.rs | 27 +++++++ crates/core_simd/tests/ops_macros.rs | 73 +++++++++++++------ crates/core_simd/tests/round.rs | 1 - crates/std_float/Cargo.toml | 2 +- crates/test_helpers/src/lib.rs | 20 ++++- 19 files changed, 213 insertions(+), 104 deletions(-) diff --git a/beginners-guide.md b/beginners-guide.md index dfd357c4592..75158e5aa85 100644 --- a/beginners-guide.md +++ b/beginners-guide.md @@ -33,7 +33,7 @@ SIMD has a few special vocabulary terms you should know: * **Vertical:** When an operation is "vertical", each lane processes 
individually without regard to the other lanes in the same vector. For example, a "vertical add" between two vectors would add lane 0 in `a` with lane 0 in `b`, with the total in lane 0 of `out`, and then the same thing for lanes 1, 2, etc. Most SIMD operations are vertical operations, so if your problem is a vertical problem then you can probably solve it with SIMD. -* **Horizontal:** When an operation is "horizontal", the lanes within a single vector interact in some way. A "horizontal add" might add up lane 0 of `a` with lane 1 of `a`, with the total in lane 0 of `out`. +* **Reducing/Reduce:** When an operation is "reducing" (functions named `reduce_*`), the lanes within a single vector are merged using some operation such as addition, returning the merged value as a scalar. For instance, a reducing add would return the sum of all the lanes' values. * **Target Feature:** Rust calls a CPU architecture extension a `target_feature`. Proper SIMD requires various CPU extensions to be enabled (details below). Don't confuse this with `feature`, which is a Cargo crate concept. @@ -83,4 +83,4 @@ Fortunately, most SIMD types have a fairly predictable size. `i32x4` is bit-equi However, this is not the same as alignment. Computer architectures generally prefer aligned accesses, especially when moving data between memory and vector registers, and while some support specialized operations that can bend the rules to help with this, unaligned access is still typically slow, or even undefined behavior. In addition, different architectures can require different alignments when interacting with their native SIMD types. For this reason, any `#[repr(simd)]` type has a non-portable alignment. If it is necessary to directly interact with the alignment of these types, it should be via [`mem::align_of`]. 
[`mem::transmute`]: https://doc.rust-lang.org/core/mem/fn.transmute.html -[`mem::align_of`]: https://doc.rust-lang.org/core/mem/fn.align_of.html \ No newline at end of file +[`mem::align_of`]: https://doc.rust-lang.org/core/mem/fn.align_of.html diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml index d2ff5f3b1b1..8877c6df66e 100644 --- a/crates/core_simd/Cargo.toml +++ b/crates/core_simd/Cargo.toml @@ -9,7 +9,7 @@ categories = ["hardware-support", "no-std"] license = "MIT OR Apache-2.0" [features] -default = ["std", "generic_const_exprs"] +default = [] std = [] generic_const_exprs = [] diff --git a/crates/core_simd/examples/matrix_inversion.rs b/crates/core_simd/examples/matrix_inversion.rs index c51a566deb5..39f530f68f5 100644 --- a/crates/core_simd/examples/matrix_inversion.rs +++ b/crates/core_simd/examples/matrix_inversion.rs @@ -233,7 +233,7 @@ pub fn simd_inv4x4(m: Matrix4x4) -> Option { let det = det.rotate_lanes_right::<2>() + det; let det = det.reverse().rotate_lanes_right::<2>() + det; - if det.horizontal_sum() == 0. { + if det.reduce_sum() == 0. 
{ return None; } // calculate the reciprocal diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs index b16b952f71e..df38a00967f 100644 --- a/crates/core_simd/examples/nbody.rs +++ b/crates/core_simd/examples/nbody.rs @@ -107,10 +107,10 @@ mod nbody { let mut e = 0.; for i in 0..N_BODIES { let bi = &bodies[i]; - e += bi.mass * (bi.v * bi.v).horizontal_sum() * 0.5; + e += bi.mass * (bi.v * bi.v).reduce_sum() * 0.5; for bj in bodies.iter().take(N_BODIES).skip(i + 1) { let dx = bi.x - bj.x; - e -= bi.mass * bj.mass / (dx * dx).horizontal_sum().sqrt() + e -= bi.mass * bj.mass / (dx * dx).reduce_sum().sqrt() } } e @@ -134,8 +134,8 @@ mod nbody { let mut mag = [0.0; N]; for i in (0..N).step_by(2) { let d2s = f64x2::from_array([ - (r[i] * r[i]).horizontal_sum(), - (r[i + 1] * r[i + 1]).horizontal_sum(), + (r[i] * r[i]).reduce_sum(), + (r[i + 1] * r[i + 1]).reduce_sum(), ]); let dmags = f64x2::splat(dt) / (d2s * d2s.sqrt()); mag[i] = dmags[0]; diff --git a/crates/core_simd/examples/spectral_norm.rs b/crates/core_simd/examples/spectral_norm.rs index c515dad4dea..012182e090b 100644 --- a/crates/core_simd/examples/spectral_norm.rs +++ b/crates/core_simd/examples/spectral_norm.rs @@ -20,7 +20,7 @@ fn mult_av(v: &[f64], out: &mut [f64]) { sum += b / a; j += 2 } - *out = sum.horizontal_sum(); + *out = sum.reduce_sum(); } } @@ -38,7 +38,7 @@ fn mult_atv(v: &[f64], out: &mut [f64]) { sum += b / a; j += 2 } - *out = sum.horizontal_sum(); + *out = sum.reduce_sum(); } } diff --git a/crates/core_simd/src/comparisons.rs b/crates/core_simd/src/comparisons.rs index d024cf4ddbe..7b0d0a6864b 100644 --- a/crates/core_simd/src/comparisons.rs +++ b/crates/core_simd/src/comparisons.rs @@ -66,3 +66,55 @@ where unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) } } } + +macro_rules! impl_ord_methods_vector { + { $type:ty } => { + impl Simd<$type, LANES> + where + LaneCount: SupportedLaneCount, + { + /// Returns the lane-wise minimum with `other`. 
+ #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + pub fn min(self, other: Self) -> Self { + self.lanes_gt(other).select(other, self) + } + + /// Returns the lane-wise maximum with `other`. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + pub fn max(self, other: Self) -> Self { + self.lanes_lt(other).select(other, self) + } + + /// Restrict each lane to a certain interval. + /// + /// For each lane, returns `max` if `self` is greater than `max`, and `min` if `self` is + /// less than `min`. Otherwise returns `self`. + /// + /// # Panics + /// + /// Panics if `min > max` on any lane. + #[must_use = "method returns a new vector and does not mutate the original value"] + #[inline] + pub fn clamp(self, min: Self, max: Self) -> Self { + assert!( + min.lanes_le(max).all(), + "each lane in `min` must be less than or equal to the corresponding lane in `max`", + ); + self.max(min).min(max) + } + } + } +} + +impl_ord_methods_vector!(i8); +impl_ord_methods_vector!(i16); +impl_ord_methods_vector!(i32); +impl_ord_methods_vector!(i64); +impl_ord_methods_vector!(isize); +impl_ord_methods_vector!(u8); +impl_ord_methods_vector!(u16); +impl_ord_methods_vector!(u32); +impl_ord_methods_vector!(u64); +impl_ord_methods_vector!(usize); diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index cf2c0a02351..426c4de6ab1 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -18,7 +18,6 @@ //! //! Unless stated otherwise, all intrinsics for binary operations require SIMD vectors of equal types and lengths. - // These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are // mostly lowered to the matching LLVM instructions by the compiler in a fairly straightforward manner. // The associated LLVM instruction or intrinsic is documented alongside each Rust intrinsic function. 
@@ -130,6 +129,14 @@ extern "platform-intrinsic" { pub(crate) fn simd_reduce_xor(x: T) -> U; // truncate integer vector to bitmask + // `fn simd_bitmask(vector) -> unsigned integer` takes a vector of integers and + // returns either an unsigned integer or array of `u8`. + // Every element in the vector becomes a single bit in the returned bitmask. + // If the vector has less than 8 lanes, a u8 is returned with zeroed trailing bits. + // The bit order of the result depends on the byte endianness. LSB-first for little + // endian and MSB-first for big endian. + // + // UB if called on a vector with values other than 0 and -1. #[allow(unused)] pub(crate) fn simd_bitmask(x: T) -> U; diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs index 91ae34c05e0..2632073622e 100644 --- a/crates/core_simd/src/lib.rs +++ b/crates/core_simd/src/lib.rs @@ -1,6 +1,5 @@ -#![cfg_attr(not(feature = "std"), no_std)] +#![no_std] #![feature( - const_fn_trait_bound, convert_float_to_int, decl_macro, intra_doc_pointers, diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs index 1c2037764c1..c263f6a4eec 100644 --- a/crates/core_simd/src/masks/to_bitmask.rs +++ b/crates/core_simd/src/masks/to_bitmask.rs @@ -50,6 +50,9 @@ macro_rules! impl_integer_intrinsic { } impl_integer_intrinsic! { + unsafe impl ToBitMask for Mask<_, 1> + unsafe impl ToBitMask for Mask<_, 2> + unsafe impl ToBitMask for Mask<_, 4> unsafe impl ToBitMask for Mask<_, 8> unsafe impl ToBitMask for Mask<_, 16> unsafe impl ToBitMask for Mask<_, 32> diff --git a/crates/core_simd/src/math.rs b/crates/core_simd/src/math.rs index 0b4e40983af..606021e983e 100644 --- a/crates/core_simd/src/math.rs +++ b/crates/core_simd/src/math.rs @@ -10,8 +10,7 @@ macro_rules! 
impl_uint_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::MAX;")] /// let x = Simd::from_array([2, 1, 0, MAX]); /// let max = Simd::splat(MAX); @@ -31,8 +30,7 @@ macro_rules! impl_uint_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::MAX;")] /// let x = Simd::from_array([2, 1, 0, MAX]); /// let max = Simd::splat(MAX); @@ -58,8 +56,7 @@ macro_rules! impl_int_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] /// let x = Simd::from_array([MIN, 0, 1, MAX]); /// let max = Simd::splat(MAX); @@ -79,8 +76,7 @@ macro_rules! impl_int_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] /// let x = Simd::from_array([MIN, -2, -1, MAX]); /// let max = Simd::splat(MAX); @@ -100,8 +96,7 @@ macro_rules! impl_int_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]); /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0])); @@ -119,8 +114,7 @@ macro_rules! 
impl_int_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] /// let xs = Simd::from_array([MIN, -2, 0, 3]); /// let unsat = xs.abs(); @@ -142,8 +136,7 @@ macro_rules! impl_int_arith { /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] /// let x = Simd::from_array([MIN, -2, 3, MAX]); /// let unsat = -x; diff --git a/crates/core_simd/src/reduction.rs b/crates/core_simd/src/reduction.rs index e1cd743e442..3177fd167fc 100644 --- a/crates/core_simd/src/reduction.rs +++ b/crates/core_simd/src/reduction.rs @@ -11,30 +11,30 @@ macro_rules! impl_integer_reductions { where LaneCount: SupportedLaneCount, { - /// Horizontal wrapping add. Returns the sum of the lanes of the vector, with wrapping addition. + /// Reducing wrapping add. Returns the sum of the lanes of the vector, with wrapping addition. #[inline] - pub fn horizontal_sum(self) -> $scalar { + pub fn reduce_sum(self) -> $scalar { // Safety: `self` is an integer vector unsafe { simd_reduce_add_ordered(self, 0) } } - /// Horizontal wrapping multiply. Returns the product of the lanes of the vector, with wrapping multiplication. + /// Reducing wrapping multiply. Returns the product of the lanes of the vector, with wrapping multiplication. #[inline] - pub fn horizontal_product(self) -> $scalar { + pub fn reduce_product(self) -> $scalar { // Safety: `self` is an integer vector unsafe { simd_reduce_mul_ordered(self, 1) } } - /// Horizontal maximum. Returns the maximum lane in the vector. + /// Reducing maximum. Returns the maximum lane in the vector. 
#[inline] - pub fn horizontal_max(self) -> $scalar { + pub fn reduce_max(self) -> $scalar { // Safety: `self` is an integer vector unsafe { simd_reduce_max(self) } } - /// Horizontal minimum. Returns the minimum lane in the vector. + /// Reducing minimum. Returns the minimum lane in the vector. #[inline] - pub fn horizontal_min(self) -> $scalar { + pub fn reduce_min(self) -> $scalar { // Safety: `self` is an integer vector unsafe { simd_reduce_min(self) } } @@ -60,9 +60,9 @@ macro_rules! impl_float_reductions { LaneCount: SupportedLaneCount, { - /// Horizontal add. Returns the sum of the lanes of the vector. + /// Reducing add. Returns the sum of the lanes of the vector. #[inline] - pub fn horizontal_sum(self) -> $scalar { + pub fn reduce_sum(self) -> $scalar { // LLVM sum is inaccurate on i586 if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { self.as_array().iter().sum() @@ -72,9 +72,9 @@ macro_rules! impl_float_reductions { } } - /// Horizontal multiply. Returns the product of the lanes of the vector. + /// Reducing multiply. Returns the product of the lanes of the vector. #[inline] - pub fn horizontal_product(self) -> $scalar { + pub fn reduce_product(self) -> $scalar { // LLVM product is inaccurate on i586 if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { self.as_array().iter().product() @@ -84,22 +84,22 @@ macro_rules! impl_float_reductions { } } - /// Horizontal maximum. Returns the maximum lane in the vector. + /// Reducing maximum. Returns the maximum lane in the vector. /// /// Returns values based on equality, so a vector containing both `0.` and `-0.` may /// return either. This function will not return `NaN` unless all lanes are `NaN`. #[inline] - pub fn horizontal_max(self) -> $scalar { + pub fn reduce_max(self) -> $scalar { // Safety: `self` is a float vector unsafe { simd_reduce_max(self) } } - /// Horizontal minimum. Returns the minimum lane in the vector. + /// Reducing minimum. 
Returns the minimum lane in the vector. /// /// Returns values based on equality, so a vector containing both `0.` and `-0.` may /// return either. This function will not return `NaN` unless all lanes are `NaN`. #[inline] - pub fn horizontal_min(self) -> $scalar { + pub fn reduce_min(self) -> $scalar { // Safety: `self` is a float vector unsafe { simd_reduce_min(self) } } @@ -116,10 +116,10 @@ where T: SimdElement + BitAnd, LaneCount: SupportedLaneCount, { - /// Horizontal bitwise "and". Returns the cumulative bitwise "and" across the lanes of + /// Reducing bitwise "and". Returns the cumulative bitwise "and" across the lanes of /// the vector. #[inline] - pub fn horizontal_and(self) -> T { + pub fn reduce_and(self) -> T { unsafe { simd_reduce_and(self) } } } @@ -130,10 +130,10 @@ where T: SimdElement + BitOr, LaneCount: SupportedLaneCount, { - /// Horizontal bitwise "or". Returns the cumulative bitwise "or" across the lanes of + /// Reducing bitwise "or". Returns the cumulative bitwise "or" across the lanes of /// the vector. #[inline] - pub fn horizontal_or(self) -> T { + pub fn reduce_or(self) -> T { unsafe { simd_reduce_or(self) } } } @@ -144,10 +144,10 @@ where T: SimdElement + BitXor, LaneCount: SupportedLaneCount, { - /// Horizontal bitwise "xor". Returns the cumulative bitwise "xor" across the lanes of + /// Reducing bitwise "xor". Returns the cumulative bitwise "xor" across the lanes of /// the vector. 
#[inline] - pub fn horizontal_xor(self) -> T { + pub fn reduce_xor(self) -> T { unsafe { simd_reduce_xor(self) } } } diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs index 3acf07260e1..065c5987d3f 100644 --- a/crates/core_simd/src/select.rs +++ b/crates/core_simd/src/select.rs @@ -14,8 +14,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask}; - /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask}; + /// # use core::simd::{Simd, Mask}; /// let a = Simd::from_array([0, 1, 2, 3]); /// let b = Simd::from_array([4, 5, 6, 7]); /// let mask = Mask::from_array([true, false, false, true]); @@ -45,8 +44,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Mask; - /// # #[cfg(not(feature = "std"))] use core::simd::Mask; + /// # use core::simd::Mask; /// let a = Mask::::from_array([true, true, false, false]); /// let b = Mask::::from_array([false, false, true, true]); /// let mask = Mask::::from_array([true, false, false, true]); diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs index 08b2add1166..ef47c4f3a4c 100644 --- a/crates/core_simd/src/swizzle.rs +++ b/crates/core_simd/src/swizzle.rs @@ -12,8 +12,7 @@ use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; /// ## One source vector /// ``` /// # #![feature(portable_simd)] -/// # #[cfg(feature = "std")] use core_simd::{Simd, simd_swizzle}; -/// # #[cfg(not(feature = "std"))] use core::simd::{Simd, simd_swizzle}; +/// # use core::simd::{Simd, simd_swizzle}; /// let v = Simd::::from_array([0., 1., 2., 3.]); /// /// // Keeping the same size @@ -28,8 +27,7 @@ use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; /// ## Two source vectors /// ``` /// # #![feature(portable_simd)] -/// # #[cfg(feature = "std")] use core_simd::{Simd, simd_swizzle, Which}; -/// # #[cfg(not(feature = "std"))] use 
core::simd::{Simd, simd_swizzle, Which}; +/// # use core::simd::{Simd, simd_swizzle, Which}; /// use Which::*; /// let a = Simd::::from_array([0., 1., 2., 3.]); /// let b = Simd::::from_array([4., 5., 6., 7.]); @@ -273,8 +271,7 @@ where /// /// ``` /// #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; /// let a = Simd::from_array([0, 1, 2, 3]); /// let b = Simd::from_array([4, 5, 6, 7]); /// let (x, y) = a.interleave(b); @@ -337,8 +334,7 @@ where /// /// ``` /// #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; /// let a = Simd::from_array([0, 4, 1, 5]); /// let b = Simd::from_array([2, 6, 3, 7]); /// let (x, y) = a.deinterleave(b); diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 3ccaf54b2a3..b9cd2e2021e 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -153,8 +153,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; /// let floats: Simd = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]); /// let ints = floats.cast::(); /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0])); @@ -180,8 +179,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 5]); /// let alt = Simd::from_array([-5, -4, -3, -2]); @@ -201,8 +199,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # 
#[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 5]); /// @@ -225,8 +222,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask}; - /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask}; + /// # use core::simd::{Simd, Mask}; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 5]); /// let alt = Simd::from_array([-5, -4, -3, -2]); @@ -260,8 +256,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask}; - /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask}; + /// # use core::simd::{Simd, Mask}; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 5]); /// let alt = Simd::from_array([-5, -4, -3, -2]); @@ -296,8 +291,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::Simd; - /// # #[cfg(not(feature = "std"))] use core::simd::Simd; + /// # use core::simd::Simd; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 0]); /// let vals = Simd::from_array([-27, 82, -41, 124]); @@ -319,8 +313,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask}; - /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask}; + /// # use core::simd::{Simd, Mask}; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 0]); /// let vals = Simd::from_array([-27, 82, -41, 124]); @@ -354,8 +347,7 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask}; - /// # #[cfg(not(feature = "std"))] use 
core::simd::{Simd, Mask}; + /// # use core::simd::{Simd, Mask}; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 0]); /// let vals = Simd::from_array([-27, 82, -41, 124]); diff --git a/crates/core_simd/tests/i16_ops.rs b/crates/core_simd/tests/i16_ops.rs index f6c5d74fbbc..171e5b472fa 100644 --- a/crates/core_simd/tests/i16_ops.rs +++ b/crates/core_simd/tests/i16_ops.rs @@ -1,5 +1,32 @@ #![feature(portable_simd)] +use core_simd::i16x2; #[macro_use] mod ops_macros; impl_signed_tests! { i16 } + +#[test] +fn max_is_not_lexicographic() { + let a = i16x2::splat(10); + let b = i16x2::from_array([-4, 12]); + assert_eq!(a.max(b), i16x2::from_array([10, 12])); +} + +#[test] +fn min_is_not_lexicographic() { + let a = i16x2::splat(10); + let b = i16x2::from_array([12, -4]); + assert_eq!(a.min(b), i16x2::from_array([10, -4])); +} + +#[test] +fn clamp_is_not_lexicographic() { + let a = i16x2::splat(10); + let lo = i16x2::from_array([-12, -4]); + let up = i16x2::from_array([-4, 12]); + assert_eq!(a.clamp(lo, up), i16x2::from_array([-4, 10])); + + let x = i16x2::from_array([1, 10]); + let y = x.clamp(i16x2::splat(0), i16x2::splat(9)); + assert_eq!(y, i16x2::from_array([1, 9])); +} diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs index 50f7a4ca170..7c9b17673ef 100644 --- a/crates/core_simd/tests/ops_macros.rs +++ b/crates/core_simd/tests/ops_macros.rs @@ -94,70 +94,70 @@ macro_rules! impl_binary_checked_op_test { macro_rules! impl_common_integer_tests { { $vector:ident, $scalar:ident } => { test_helpers::test_lanes! { - fn horizontal_sum() { + fn reduce_sum() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! 
( - $vector::::from_array(x).horizontal_sum(), + $vector::::from_array(x).reduce_sum(), x.iter().copied().fold(0 as $scalar, $scalar::wrapping_add), ); Ok(()) }); } - fn horizontal_product() { + fn reduce_product() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - $vector::::from_array(x).horizontal_product(), + $vector::::from_array(x).reduce_product(), x.iter().copied().fold(1 as $scalar, $scalar::wrapping_mul), ); Ok(()) }); } - fn horizontal_and() { + fn reduce_and() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - $vector::::from_array(x).horizontal_and(), + $vector::::from_array(x).reduce_and(), x.iter().copied().fold(-1i8 as $scalar, <$scalar as core::ops::BitAnd>::bitand), ); Ok(()) }); } - fn horizontal_or() { + fn reduce_or() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - $vector::::from_array(x).horizontal_or(), + $vector::::from_array(x).reduce_or(), x.iter().copied().fold(0 as $scalar, <$scalar as core::ops::BitOr>::bitor), ); Ok(()) }); } - fn horizontal_xor() { + fn reduce_xor() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - $vector::::from_array(x).horizontal_xor(), + $vector::::from_array(x).reduce_xor(), x.iter().copied().fold(0 as $scalar, <$scalar as core::ops::BitXor>::bitxor), ); Ok(()) }); } - fn horizontal_max() { + fn reduce_max() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - $vector::::from_array(x).horizontal_max(), + $vector::::from_array(x).reduce_max(), x.iter().copied().max().unwrap(), ); Ok(()) }); } - fn horizontal_min() { + fn reduce_min() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - $vector::::from_array(x).horizontal_min(), + $vector::::from_array(x).reduce_min(), x.iter().copied().min().unwrap(), ); Ok(()) @@ -222,6 +222,35 @@ macro_rules! 
impl_signed_tests { assert_eq!(a % b, Vector::::splat(0)); } + fn min() { + let a = Vector::::splat(Scalar::MIN); + let b = Vector::::splat(0); + assert_eq!(a.min(b), a); + let a = Vector::::splat(Scalar::MAX); + let b = Vector::::splat(0); + assert_eq!(a.min(b), b); + } + + fn max() { + let a = Vector::::splat(Scalar::MIN); + let b = Vector::::splat(0); + assert_eq!(a.max(b), b); + let a = Vector::::splat(Scalar::MAX); + let b = Vector::::splat(0); + assert_eq!(a.max(b), a); + } + + fn clamp() { + let min = Vector::::splat(Scalar::MIN); + let max = Vector::::splat(Scalar::MAX); + let zero = Vector::::splat(0); + let one = Vector::::splat(1); + let negone = Vector::::splat(-1); + assert_eq!(zero.clamp(min, max), zero); + assert_eq!(zero.clamp(min, one), zero); + assert_eq!(zero.clamp(one, max), one); + assert_eq!(zero.clamp(min, negone), negone); + } } test_helpers::test_lanes_panic! { @@ -499,29 +528,29 @@ macro_rules! impl_float_tests { }) } - fn horizontal_sum() { + fn reduce_sum() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - Vector::::from_array(x).horizontal_sum(), + Vector::::from_array(x).reduce_sum(), x.iter().sum(), ); Ok(()) }); } - fn horizontal_product() { + fn reduce_product() { test_helpers::test_1(&|x| { test_helpers::prop_assert_biteq! ( - Vector::::from_array(x).horizontal_product(), + Vector::::from_array(x).reduce_product(), x.iter().product(), ); Ok(()) }); } - fn horizontal_max() { + fn reduce_max() { test_helpers::test_1(&|x| { - let vmax = Vector::::from_array(x).horizontal_max(); + let vmax = Vector::::from_array(x).reduce_max(); let smax = x.iter().copied().fold(Scalar::NAN, Scalar::max); // 0 and -0 are treated the same if !(x.contains(&0.) && x.contains(&-0.) && vmax.abs() == 0. && smax.abs() == 0.) { @@ -531,9 +560,9 @@ macro_rules! 
impl_float_tests { }); } - fn horizontal_min() { + fn reduce_min() { test_helpers::test_1(&|x| { - let vmax = Vector::::from_array(x).horizontal_min(); + let vmax = Vector::::from_array(x).reduce_min(); let smax = x.iter().copied().fold(Scalar::NAN, Scalar::min); // 0 and -0 are treated the same if !(x.contains(&0.) && x.contains(&-0.) && vmax.abs() == 0. && smax.abs() == 0.) { diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs index 53732329237..7feb0320a16 100644 --- a/crates/core_simd/tests/round.rs +++ b/crates/core_simd/tests/round.rs @@ -9,7 +9,6 @@ macro_rules! float_rounding_test { type Scalar = $scalar; type IntScalar = $int_scalar; - #[cfg(feature = "std")] test_helpers::test_lanes! { fn ceil() { test_helpers::test_unary_elementwise( diff --git a/crates/std_float/Cargo.toml b/crates/std_float/Cargo.toml index 82f66b8dcb7..84c69774cbd 100644 --- a/crates/std_float/Cargo.toml +++ b/crates/std_float/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -core_simd = { path = "../core_simd" } +core_simd = { path = "../core_simd", default-features = false } [features] default = ["as_crate"] diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs index 7edd6096381..8bf7f5ed3d2 100644 --- a/crates/test_helpers/src/lib.rs +++ b/crates/test_helpers/src/lib.rs @@ -77,11 +77,21 @@ impl DefaultStrategy } } +#[cfg(not(miri))] +fn make_runner() -> proptest::test_runner::TestRunner { + Default::default() +} +#[cfg(miri)] +fn make_runner() -> proptest::test_runner::TestRunner { + // Only run a few tests on Miri + proptest::test_runner::TestRunner::new(proptest::test_runner::Config::with_cases(4)) +} + /// Test a function that takes a single value. 
pub fn test_1( f: &dyn Fn(A) -> proptest::test_runner::TestCaseResult, ) { - let mut runner = proptest::test_runner::TestRunner::default(); + let mut runner = make_runner(); runner.run(&A::default_strategy(), f).unwrap(); } @@ -89,7 +99,7 @@ pub fn test_1( pub fn test_2( f: &dyn Fn(A, B) -> proptest::test_runner::TestCaseResult, ) { - let mut runner = proptest::test_runner::TestRunner::default(); + let mut runner = make_runner(); runner .run(&(A::default_strategy(), B::default_strategy()), |(a, b)| { f(a, b) @@ -105,7 +115,7 @@ pub fn test_3< >( f: &dyn Fn(A, B, C) -> proptest::test_runner::TestCaseResult, ) { - let mut runner = proptest::test_runner::TestRunner::default(); + let mut runner = make_runner(); runner .run( &( @@ -361,24 +371,28 @@ macro_rules! test_lanes { #[test] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow fn lanes_8() { implementation::<8>(); } #[test] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow fn lanes_16() { implementation::<16>(); } #[test] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow fn lanes_32() { implementation::<32>(); } #[test] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow fn lanes_64() { implementation::<64>(); } From 62e239cc51c8d568ffdcbde53470a31dfa444744 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Tue, 12 Apr 2022 11:01:22 -0400 Subject: [PATCH 09/35] portable-simd: use simd_arith_offset to avoid ptr-int transmutation --- crates/core_simd/src/intrinsics.rs | 4 ++++ crates/core_simd/src/vector/ptr.rs | 11 +++++++++++ 2 files changed, 15 
insertions(+) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 426c4de6ab1..82508c6882d 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -61,6 +61,10 @@ extern "platform-intrinsic" { /// xor pub(crate) fn simd_xor(x: T, y: T) -> T; + /// getelementptr (without inbounds) + #[cfg(not(bootstrap))] + pub(crate) fn simd_arith_offset(ptrs: T, offsets: U) -> T; + /// fptoui/fptosi/uitofp/sitofp /// casting floats to integers is truncating, so it is safe to convert values like e.g. 1.5 /// but the truncated value must fit in the target type or the result is poison. diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs index 417d255c28d..68a9c67f795 100644 --- a/crates/core_simd/src/vector/ptr.rs +++ b/crates/core_simd/src/vector/ptr.rs @@ -1,5 +1,8 @@ //! Private implementation details of public gather/scatter APIs. +#[cfg(not(bootstrap))] +use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +#[cfg(bootstrap)] use core::mem; /// A vector of *const T. 
@@ -21,12 +24,16 @@ where #[inline] #[must_use] pub fn wrapping_add(self, addend: Simd) -> Self { + #[cfg(bootstrap)] // Safety: converting pointers to usize and vice-versa is safe // (even if using that pointer is not) unsafe { let x: Simd = mem::transmute_copy(&self); mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) } + #[cfg(not(bootstrap))] + // Safety: this intrinsic doesn't have a precondition + unsafe { intrinsics::simd_arith_offset(self, addend) } } } @@ -49,11 +56,15 @@ where #[inline] #[must_use] pub fn wrapping_add(self, addend: Simd) -> Self { + #[cfg(bootstrap)] // Safety: converting pointers to usize and vice-versa is safe // (even if using that pointer is not) unsafe { let x: Simd = mem::transmute_copy(&self); mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) } + #[cfg(not(bootstrap))] + // Safety: this intrinsic doesn't have a precondition + unsafe { intrinsics::simd_arith_offset(self, addend) } } } From 352e7b30c23948c863af1a4e62408cdf23cccf50 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Fri, 20 May 2022 08:54:10 -0400 Subject: [PATCH 10/35] Finish bumping stage0 It looks like the last time had left some remaining cfg's -- which made me think that the stage0 bump was actually successful. This brings us to a released 1.62 beta though. 
--- crates/core_simd/src/intrinsics.rs | 1 - crates/core_simd/src/vector/ptr.rs | 19 ------------------- 2 files changed, 20 deletions(-) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 82508c6882d..962c83a78cb 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -62,7 +62,6 @@ extern "platform-intrinsic" { pub(crate) fn simd_xor(x: T, y: T) -> T; /// getelementptr (without inbounds) - #[cfg(not(bootstrap))] pub(crate) fn simd_arith_offset(ptrs: T, offsets: U) -> T; /// fptoui/fptosi/uitofp/sitofp diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs index 68a9c67f795..fa756344db9 100644 --- a/crates/core_simd/src/vector/ptr.rs +++ b/crates/core_simd/src/vector/ptr.rs @@ -1,9 +1,6 @@ //! Private implementation details of public gather/scatter APIs. -#[cfg(not(bootstrap))] use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SupportedLaneCount}; -#[cfg(bootstrap)] -use core::mem; /// A vector of *const T. 
#[derive(Debug, Copy, Clone)] @@ -24,14 +21,6 @@ where #[inline] #[must_use] pub fn wrapping_add(self, addend: Simd) -> Self { - #[cfg(bootstrap)] - // Safety: converting pointers to usize and vice-versa is safe - // (even if using that pointer is not) - unsafe { - let x: Simd = mem::transmute_copy(&self); - mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) - } - #[cfg(not(bootstrap))] // Safety: this intrinsic doesn't have a precondition unsafe { intrinsics::simd_arith_offset(self, addend) } } @@ -56,14 +45,6 @@ where #[inline] #[must_use] pub fn wrapping_add(self, addend: Simd) -> Self { - #[cfg(bootstrap)] - // Safety: converting pointers to usize and vice-versa is safe - // (even if using that pointer is not) - unsafe { - let x: Simd = mem::transmute_copy(&self); - mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::())) }) - } - #[cfg(not(bootstrap))] // Safety: this intrinsic doesn't have a precondition unsafe { intrinsics::simd_arith_offset(self, addend) } } From 210275cc7555d9670a20f0b386afe138eedda91d Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Wed, 20 Jul 2022 17:57:56 -0700 Subject: [PATCH 11/35] Sync core::simd up to rust-lang/portable-simd@2e081db92aa3ee0a4563bc28ce01bdad5b1b2efd --- beginners-guide.md | 5 + crates/core_simd/Cargo.toml | 3 +- crates/core_simd/src/comparisons.rs | 120 -------- crates/core_simd/src/elements.rs | 11 + crates/core_simd/src/elements/float.rs | 357 +++++++++++++++++++++++ crates/core_simd/src/elements/int.rs | 298 +++++++++++++++++++ crates/core_simd/src/elements/uint.rs | 139 +++++++++ crates/core_simd/src/eq.rs | 73 +++++ crates/core_simd/src/lane_count.rs | 8 +- crates/core_simd/src/lib.rs | 2 +- crates/core_simd/src/masks.rs | 55 ++-- crates/core_simd/src/masks/bitmask.rs | 20 ++ crates/core_simd/src/masks/full_masks.rs | 85 +++++- crates/core_simd/src/masks/to_bitmask.rs | 61 +++- crates/core_simd/src/math.rs | 156 ---------- crates/core_simd/src/mod.rs | 12 +- 
crates/core_simd/src/ops.rs | 14 +- crates/core_simd/src/ops/unary.rs | 1 + crates/core_simd/src/ord.rs | 213 ++++++++++++++ crates/core_simd/src/reduction.rs | 153 ---------- crates/core_simd/src/round.rs | 40 --- crates/core_simd/src/swizzle.rs | 52 ++-- crates/core_simd/src/vector.rs | 157 ++++++++-- crates/core_simd/src/vector/float.rs | 191 +----------- crates/core_simd/src/vector/int.rs | 82 ++---- crates/core_simd/src/vector/uint.rs | 40 +-- crates/core_simd/tests/i16_ops.rs | 27 -- crates/core_simd/tests/masks.rs | 56 ++++ crates/core_simd/tests/ops_macros.rs | 48 +-- crates/core_simd/tests/round.rs | 2 +- crates/test_helpers/src/lib.rs | 4 +- 31 files changed, 1605 insertions(+), 880 deletions(-) delete mode 100644 crates/core_simd/src/comparisons.rs create mode 100644 crates/core_simd/src/elements.rs create mode 100644 crates/core_simd/src/elements/float.rs create mode 100644 crates/core_simd/src/elements/int.rs create mode 100644 crates/core_simd/src/elements/uint.rs create mode 100644 crates/core_simd/src/eq.rs delete mode 100644 crates/core_simd/src/math.rs create mode 100644 crates/core_simd/src/ord.rs delete mode 100644 crates/core_simd/src/reduction.rs delete mode 100644 crates/core_simd/src/round.rs diff --git a/beginners-guide.md b/beginners-guide.md index 75158e5aa85..17ade06ae80 100644 --- a/beginners-guide.md +++ b/beginners-guide.md @@ -82,5 +82,10 @@ Fortunately, most SIMD types have a fairly predictable size. `i32x4` is bit-equi However, this is not the same as alignment. Computer architectures generally prefer aligned accesses, especially when moving data between memory and vector registers, and while some support specialized operations that can bend the rules to help with this, unaligned access is still typically slow, or even undefined behavior. In addition, different architectures can require different alignments when interacting with their native SIMD types. For this reason, any `#[repr(simd)]` type has a non-portable alignment. 
If it is necessary to directly interact with the alignment of these types, it should be via [`mem::align_of`]. +When working with slices, data correctly aligned for SIMD can be acquired using the [`as_simd`] and [`as_simd_mut`] methods of the slice primitive. + [`mem::transmute`]: https://doc.rust-lang.org/core/mem/fn.transmute.html [`mem::align_of`]: https://doc.rust-lang.org/core/mem/fn.align_of.html +[`as_simd`]: https://doc.rust-lang.org/nightly/std/primitive.slice.html#method.as_simd +[`as_simd_mut`]: https://doc.rust-lang.org/nightly/std/primitive.slice.html#method.as_simd_mut + diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml index 8877c6df66e..8a29cf15696 100644 --- a/crates/core_simd/Cargo.toml +++ b/crates/core_simd/Cargo.toml @@ -9,7 +9,8 @@ categories = ["hardware-support", "no-std"] license = "MIT OR Apache-2.0" [features] -default = [] +default = ["as_crate"] +as_crate = [] std = [] generic_const_exprs = [] diff --git a/crates/core_simd/src/comparisons.rs b/crates/core_simd/src/comparisons.rs deleted file mode 100644 index 7b0d0a6864b..00000000000 --- a/crates/core_simd/src/comparisons.rs +++ /dev/null @@ -1,120 +0,0 @@ -use crate::simd::intrinsics; -use crate::simd::{LaneCount, Mask, Simd, SimdElement, SupportedLaneCount}; - -impl Simd -where - T: SimdElement + PartialEq, - LaneCount: SupportedLaneCount, -{ - /// Test if each lane is equal to the corresponding lane in `other`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn lanes_eq(self, other: Self) -> Mask { - // Safety: `self` is a vector, and the result of the comparison - // is always a valid mask. - unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) } - } - - /// Test if each lane is not equal to the corresponding lane in `other`. 
- #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn lanes_ne(self, other: Self) -> Mask { - // Safety: `self` is a vector, and the result of the comparison - // is always a valid mask. - unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) } - } -} - -impl Simd -where - T: SimdElement + PartialOrd, - LaneCount: SupportedLaneCount, -{ - /// Test if each lane is less than the corresponding lane in `other`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn lanes_lt(self, other: Self) -> Mask { - // Safety: `self` is a vector, and the result of the comparison - // is always a valid mask. - unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) } - } - - /// Test if each lane is greater than the corresponding lane in `other`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn lanes_gt(self, other: Self) -> Mask { - // Safety: `self` is a vector, and the result of the comparison - // is always a valid mask. - unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) } - } - - /// Test if each lane is less than or equal to the corresponding lane in `other`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn lanes_le(self, other: Self) -> Mask { - // Safety: `self` is a vector, and the result of the comparison - // is always a valid mask. - unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) } - } - - /// Test if each lane is greater than or equal to the corresponding lane in `other`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn lanes_ge(self, other: Self) -> Mask { - // Safety: `self` is a vector, and the result of the comparison - // is always a valid mask. - unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) } - } -} - -macro_rules! 
impl_ord_methods_vector { - { $type:ty } => { - impl Simd<$type, LANES> - where - LaneCount: SupportedLaneCount, - { - /// Returns the lane-wise minimum with `other`. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn min(self, other: Self) -> Self { - self.lanes_gt(other).select(other, self) - } - - /// Returns the lane-wise maximum with `other`. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn max(self, other: Self) -> Self { - self.lanes_lt(other).select(other, self) - } - - /// Restrict each lane to a certain interval. - /// - /// For each lane, returns `max` if `self` is greater than `max`, and `min` if `self` is - /// less than `min`. Otherwise returns `self`. - /// - /// # Panics - /// - /// Panics if `min > max` on any lane. - #[must_use = "method returns a new vector and does not mutate the original value"] - #[inline] - pub fn clamp(self, min: Self, max: Self) -> Self { - assert!( - min.lanes_le(max).all(), - "each lane in `min` must be less than or equal to the corresponding lane in `max`", - ); - self.max(min).min(max) - } - } - } -} - -impl_ord_methods_vector!(i8); -impl_ord_methods_vector!(i16); -impl_ord_methods_vector!(i32); -impl_ord_methods_vector!(i64); -impl_ord_methods_vector!(isize); -impl_ord_methods_vector!(u8); -impl_ord_methods_vector!(u16); -impl_ord_methods_vector!(u32); -impl_ord_methods_vector!(u64); -impl_ord_methods_vector!(usize); diff --git a/crates/core_simd/src/elements.rs b/crates/core_simd/src/elements.rs new file mode 100644 index 00000000000..701eb66b248 --- /dev/null +++ b/crates/core_simd/src/elements.rs @@ -0,0 +1,11 @@ +mod float; +mod int; +mod uint; + +mod sealed { + pub trait Sealed {} +} + +pub use float::*; +pub use int::*; +pub use uint::*; diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/elements/float.rs new file mode 100644 index 00000000000..d6022327055 --- /dev/null +++ 
b/crates/core_simd/src/elements/float.rs @@ -0,0 +1,357 @@ +use super::sealed::Sealed; +use crate::simd::{ + intrinsics, LaneCount, Mask, Simd, SimdElement, SimdPartialEq, SimdPartialOrd, + SupportedLaneCount, +}; + +/// Operations on SIMD vectors of floats. +pub trait SimdFloat: Copy + Sealed { + /// Mask type used for manipulating this SIMD vector type. + type Mask; + + /// Scalar type contained by this SIMD vector type. + type Scalar; + + /// Bit representation of this SIMD vector type. + type Bits; + + /// Raw transmutation to an unsigned integer vector type with the + /// same size and number of lanes. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn to_bits(self) -> Self::Bits; + + /// Raw transmutation from an unsigned integer vector type with the + /// same size and number of lanes. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn from_bits(bits: Self::Bits) -> Self; + + /// Produces a vector where every lane has the absolute value of the + /// equivalently-indexed lane in `self`. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn abs(self) -> Self; + + /// Takes the reciprocal (inverse) of each lane, `1/x`. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn recip(self) -> Self; + + /// Converts each lane from radians to degrees. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn to_degrees(self) -> Self; + + /// Converts each lane from degrees to radians. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn to_radians(self) -> Self; + + /// Returns true for each lane if it has a positive sign, including + /// `+0.0`, `NaN`s with positive sign bit and positive infinity. 
+ #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_sign_positive(self) -> Self::Mask; + + /// Returns true for each lane if it has a negative sign, including + /// `-0.0`, `NaN`s with negative sign bit and negative infinity. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_sign_negative(self) -> Self::Mask; + + /// Returns true for each lane if its value is `NaN`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_nan(self) -> Self::Mask; + + /// Returns true for each lane if its value is positive infinity or negative infinity. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_infinite(self) -> Self::Mask; + + /// Returns true for each lane if its value is neither infinite nor `NaN`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_finite(self) -> Self::Mask; + + /// Returns true for each lane if its value is subnormal. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_subnormal(self) -> Self::Mask; + + /// Returns true for each lane if its value is neither zero, infinite, + /// subnormal, nor `NaN`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_normal(self) -> Self::Mask; + + /// Replaces each lane with a number that represents its sign. + /// + /// * `1.0` if the number is positive, `+0.0`, or `INFINITY` + /// * `-1.0` if the number is negative, `-0.0`, or `NEG_INFINITY` + /// * `NAN` if the number is `NAN` + #[must_use = "method returns a new vector and does not mutate the original value"] + fn signum(self) -> Self; + + /// Returns each lane with the magnitude of `self` and the sign of `sign`. + /// + /// For any lane containing a `NAN`, a `NAN` with the sign of `sign` is returned. 
+ #[must_use = "method returns a new vector and does not mutate the original value"] + fn copysign(self, sign: Self) -> Self; + + /// Returns the minimum of each lane. + /// + /// If one of the values is `NAN`, then the other value is returned. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn simd_min(self, other: Self) -> Self; + + /// Returns the maximum of each lane. + /// + /// If one of the values is `NAN`, then the other value is returned. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn simd_max(self, other: Self) -> Self; + + /// Restrict each lane to a certain interval unless it is NaN. + /// + /// For each lane in `self`, returns the corresponding lane in `max` if the lane is + /// greater than `max`, and the corresponding lane in `min` if the lane is less + /// than `min`. Otherwise returns the lane in `self`. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn simd_clamp(self, min: Self, max: Self) -> Self; + + /// Returns the sum of the lanes of the vector. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{f32x2, SimdFloat}; + /// let v = f32x2::from_array([1., 2.]); + /// assert_eq!(v.reduce_sum(), 3.); + /// ``` + fn reduce_sum(self) -> Self::Scalar; + + /// Reducing multiply. Returns the product of the lanes of the vector. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{f32x2, SimdFloat}; + /// let v = f32x2::from_array([3., 4.]); + /// assert_eq!(v.reduce_product(), 12.); + /// ``` + fn reduce_product(self) -> Self::Scalar; + + /// Returns the maximum lane in the vector. 
+ /// + /// Returns values based on equality, so a vector containing both `0.` and `-0.` may + /// return either. + /// + /// This function will not return `NaN` unless all lanes are `NaN`. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{f32x2, SimdFloat}; + /// let v = f32x2::from_array([1., 2.]); + /// assert_eq!(v.reduce_max(), 2.); + /// + /// // NaN values are skipped... + /// let v = f32x2::from_array([1., f32::NAN]); + /// assert_eq!(v.reduce_max(), 1.); + /// + /// // ...unless all values are NaN + /// let v = f32x2::from_array([f32::NAN, f32::NAN]); + /// assert!(v.reduce_max().is_nan()); + /// ``` + fn reduce_max(self) -> Self::Scalar; + + /// Returns the minimum lane in the vector. + /// + /// Returns values based on equality, so a vector containing both `0.` and `-0.` may + /// return either. + /// + /// This function will not return `NaN` unless all lanes are `NaN`. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{f32x2, SimdFloat}; + /// let v = f32x2::from_array([3., 7.]); + /// assert_eq!(v.reduce_min(), 3.); + /// + /// // NaN values are skipped... + /// let v = f32x2::from_array([1., f32::NAN]); + /// assert_eq!(v.reduce_min(), 1.); + /// + /// // ...unless all values are NaN + /// let v = f32x2::from_array([f32::NAN, f32::NAN]); + /// assert!(v.reduce_min().is_nan()); + /// ``` + fn reduce_min(self) -> Self::Scalar; +} + +macro_rules! 
impl_trait { + { $($ty:ty { bits: $bits_ty:ty, mask: $mask_ty:ty }),* } => { + $( + impl Sealed for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + { + } + + impl SimdFloat for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + { + type Mask = Mask<<$mask_ty as SimdElement>::Mask, LANES>; + type Scalar = $ty; + type Bits = Simd<$bits_ty, LANES>; + + #[inline] + fn to_bits(self) -> Simd<$bits_ty, LANES> { + assert_eq!(core::mem::size_of::(), core::mem::size_of::()); + // Safety: transmuting between vector types is safe + unsafe { core::mem::transmute_copy(&self) } + } + + #[inline] + fn from_bits(bits: Simd<$bits_ty, LANES>) -> Self { + assert_eq!(core::mem::size_of::(), core::mem::size_of::()); + // Safety: transmuting between vector types is safe + unsafe { core::mem::transmute_copy(&bits) } + } + + #[inline] + fn abs(self) -> Self { + // Safety: `self` is a float vector + unsafe { intrinsics::simd_fabs(self) } + } + + #[inline] + fn recip(self) -> Self { + Self::splat(1.0) / self + } + + #[inline] + fn to_degrees(self) -> Self { + // to_degrees uses a special constant for better precision, so extract that constant + self * Self::splat(Self::Scalar::to_degrees(1.)) + } + + #[inline] + fn to_radians(self) -> Self { + self * Self::splat(Self::Scalar::to_radians(1.)) + } + + #[inline] + fn is_sign_positive(self) -> Self::Mask { + !self.is_sign_negative() + } + + #[inline] + fn is_sign_negative(self) -> Self::Mask { + let sign_bits = self.to_bits() & Simd::splat((!0 >> 1) + 1); + sign_bits.simd_gt(Simd::splat(0)) + } + + #[inline] + fn is_nan(self) -> Self::Mask { + self.simd_ne(self) + } + + #[inline] + fn is_infinite(self) -> Self::Mask { + self.abs().simd_eq(Self::splat(Self::Scalar::INFINITY)) + } + + #[inline] + fn is_finite(self) -> Self::Mask { + self.abs().simd_lt(Self::splat(Self::Scalar::INFINITY)) + } + + #[inline] + fn is_subnormal(self) -> Self::Mask { + self.abs().simd_ne(Self::splat(0.0)) & (self.to_bits() & 
Self::splat(Self::Scalar::INFINITY).to_bits()).simd_eq(Simd::splat(0)) + } + + #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] + fn is_normal(self) -> Self::Mask { + !(self.abs().simd_eq(Self::splat(0.0)) | self.is_nan() | self.is_subnormal() | self.is_infinite()) + } + + #[inline] + fn signum(self) -> Self { + self.is_nan().select(Self::splat(Self::Scalar::NAN), Self::splat(1.0).copysign(self)) + } + + #[inline] + fn copysign(self, sign: Self) -> Self { + let sign_bit = sign.to_bits() & Self::splat(-0.).to_bits(); + let magnitude = self.to_bits() & !Self::splat(-0.).to_bits(); + Self::from_bits(sign_bit | magnitude) + } + + #[inline] + fn simd_min(self, other: Self) -> Self { + // Safety: `self` and `other` are float vectors + unsafe { intrinsics::simd_fmin(self, other) } + } + + #[inline] + fn simd_max(self, other: Self) -> Self { + // Safety: `self` and `other` are floating point vectors + unsafe { intrinsics::simd_fmax(self, other) } + } + + #[inline] + fn simd_clamp(self, min: Self, max: Self) -> Self { + assert!( + min.simd_le(max).all(), + "each lane in `min` must be less than or equal to the corresponding lane in `max`", + ); + let mut x = self; + x = x.simd_lt(min).select(min, x); + x = x.simd_gt(max).select(max, x); + x + } + + #[inline] + fn reduce_sum(self) -> Self::Scalar { + // LLVM sum is inaccurate on i586 + if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { + self.as_array().iter().sum() + } else { + // Safety: `self` is a float vector + unsafe { intrinsics::simd_reduce_add_ordered(self, 0.) } + } + } + + #[inline] + fn reduce_product(self) -> Self::Scalar { + // LLVM product is inaccurate on i586 + if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { + self.as_array().iter().product() + } else { + // Safety: `self` is a float vector + unsafe { intrinsics::simd_reduce_mul_ordered(self, 1.) 
} + } + } + + #[inline] + fn reduce_max(self) -> Self::Scalar { + // Safety: `self` is a float vector + unsafe { intrinsics::simd_reduce_max(self) } + } + + #[inline] + fn reduce_min(self) -> Self::Scalar { + // Safety: `self` is a float vector + unsafe { intrinsics::simd_reduce_min(self) } + } + } + )* + } +} + +impl_trait! { f32 { bits: u32, mask: i32 }, f64 { bits: u64, mask: i64 } } diff --git a/crates/core_simd/src/elements/int.rs b/crates/core_simd/src/elements/int.rs new file mode 100644 index 00000000000..9b8c37ed466 --- /dev/null +++ b/crates/core_simd/src/elements/int.rs @@ -0,0 +1,298 @@ +use super::sealed::Sealed; +use crate::simd::{ + intrinsics, LaneCount, Mask, Simd, SimdElement, SimdPartialOrd, SupportedLaneCount, +}; + +/// Operations on SIMD vectors of signed integers. +pub trait SimdInt: Copy + Sealed { + /// Mask type used for manipulating this SIMD vector type. + type Mask; + + /// Scalar type contained by this SIMD vector type. + type Scalar; + + /// Lanewise saturating add. + /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdInt}; + /// use core::i32::{MIN, MAX}; + /// let x = Simd::from_array([MIN, 0, 1, MAX]); + /// let max = Simd::splat(MAX); + /// let unsat = x + max; + /// let sat = x.saturating_add(max); + /// assert_eq!(unsat, Simd::from_array([-1, MAX, MIN, -2])); + /// assert_eq!(sat, Simd::from_array([-1, MAX, MAX, MAX])); + /// ``` + fn saturating_add(self, second: Self) -> Self; + + /// Lanewise saturating subtract. 
+ /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdInt}; + /// use core::i32::{MIN, MAX}; + /// let x = Simd::from_array([MIN, -2, -1, MAX]); + /// let max = Simd::splat(MAX); + /// let unsat = x - max; + /// let sat = x.saturating_sub(max); + /// assert_eq!(unsat, Simd::from_array([1, MAX, MIN, 0])); + /// assert_eq!(sat, Simd::from_array([MIN, MIN, MIN, 0])); + /// ``` + fn saturating_sub(self, second: Self) -> Self; + + /// Lanewise absolute value, implemented in Rust. + /// Every lane becomes its absolute value. + /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdInt}; + /// use core::i32::{MIN, MAX}; + /// let xs = Simd::from_array([MIN, MIN + 1, -5, 0]); + /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0])); + /// ``` + fn abs(self) -> Self; + + /// Lanewise saturating absolute value, implemented in Rust. + /// As abs(), except the MIN value becomes MAX instead of itself. + /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdInt}; + /// use core::i32::{MIN, MAX}; + /// let xs = Simd::from_array([MIN, -2, 0, 3]); + /// let unsat = xs.abs(); + /// let sat = xs.saturating_abs(); + /// assert_eq!(unsat, Simd::from_array([MIN, 2, 0, 3])); + /// assert_eq!(sat, Simd::from_array([MAX, 2, 0, 3])); + /// ``` + fn saturating_abs(self) -> Self; + + /// Lanewise saturating negation, implemented in Rust. + /// As neg(), except the MIN value becomes MAX instead of itself. 
+ /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdInt}; + /// use core::i32::{MIN, MAX}; + /// let x = Simd::from_array([MIN, -2, 3, MAX]); + /// let unsat = -x; + /// let sat = x.saturating_neg(); + /// assert_eq!(unsat, Simd::from_array([MIN, 2, -3, MIN + 1])); + /// assert_eq!(sat, Simd::from_array([MAX, 2, -3, MIN + 1])); + /// ``` + fn saturating_neg(self) -> Self; + + /// Returns true for each positive lane and false if it is zero or negative. + fn is_positive(self) -> Self::Mask; + + /// Returns true for each negative lane and false if it is zero or positive. + fn is_negative(self) -> Self::Mask; + + /// Returns numbers representing the sign of each lane. + /// * `0` if the number is zero + /// * `1` if the number is positive + /// * `-1` if the number is negative + fn signum(self) -> Self; + + /// Returns the sum of the lanes of the vector, with wrapping addition. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{i32x4, SimdInt}; + /// let v = i32x4::from_array([1, 2, 3, 4]); + /// assert_eq!(v.reduce_sum(), 10); + /// + /// // SIMD integer addition is always wrapping + /// let v = i32x4::from_array([i32::MAX, 1, 0, 0]); + /// assert_eq!(v.reduce_sum(), i32::MIN); + /// ``` + fn reduce_sum(self) -> Self::Scalar; + + /// Returns the product of the lanes of the vector, with wrapping multiplication. 
+ /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{i32x4, SimdInt}; + /// let v = i32x4::from_array([1, 2, 3, 4]); + /// assert_eq!(v.reduce_product(), 24); + /// + /// // SIMD integer multiplication is always wrapping + /// let v = i32x4::from_array([i32::MAX, 2, 1, 1]); + /// assert!(v.reduce_product() < i32::MAX); + /// ``` + fn reduce_product(self) -> Self::Scalar; + + /// Returns the maximum lane in the vector. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{i32x4, SimdInt}; + /// let v = i32x4::from_array([1, 2, 3, 4]); + /// assert_eq!(v.reduce_max(), 4); + /// ``` + fn reduce_max(self) -> Self::Scalar; + + /// Returns the minimum lane in the vector. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{i32x4, SimdInt}; + /// let v = i32x4::from_array([1, 2, 3, 4]); + /// assert_eq!(v.reduce_min(), 1); + /// ``` + fn reduce_min(self) -> Self::Scalar; + + /// Returns the cumulative bitwise "and" across the lanes of the vector. + fn reduce_and(self) -> Self::Scalar; + + /// Returns the cumulative bitwise "or" across the lanes of the vector. + fn reduce_or(self) -> Self::Scalar; + + /// Returns the cumulative bitwise "xor" across the lanes of the vector. + fn reduce_xor(self) -> Self::Scalar; +} + +macro_rules! 
impl_trait { + { $($ty:ty),* } => { + $( + impl Sealed for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + { + } + + impl SimdInt for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + { + type Mask = Mask<<$ty as SimdElement>::Mask, LANES>; + type Scalar = $ty; + + #[inline] + fn saturating_add(self, second: Self) -> Self { + // Safety: `self` is a vector + unsafe { intrinsics::simd_saturating_add(self, second) } + } + + #[inline] + fn saturating_sub(self, second: Self) -> Self { + // Safety: `self` is a vector + unsafe { intrinsics::simd_saturating_sub(self, second) } + } + + #[inline] + fn abs(self) -> Self { + const SHR: $ty = <$ty>::BITS as $ty - 1; + let m = self >> Simd::splat(SHR); + (self^m) - m + } + + #[inline] + fn saturating_abs(self) -> Self { + // arith shift for -1 or 0 mask based on sign bit, giving 2s complement + const SHR: $ty = <$ty>::BITS as $ty - 1; + let m = self >> Simd::splat(SHR); + (self^m).saturating_sub(m) + } + + #[inline] + fn saturating_neg(self) -> Self { + Self::splat(0).saturating_sub(self) + } + + #[inline] + fn is_positive(self) -> Self::Mask { + self.simd_gt(Self::splat(0)) + } + + #[inline] + fn is_negative(self) -> Self::Mask { + self.simd_lt(Self::splat(0)) + } + + #[inline] + fn signum(self) -> Self { + self.is_positive().select( + Self::splat(1), + self.is_negative().select(Self::splat(-1), Self::splat(0)) + ) + } + + #[inline] + fn reduce_sum(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_add_ordered(self, 0) } + } + + #[inline] + fn reduce_product(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_mul_ordered(self, 1) } + } + + #[inline] + fn reduce_max(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_max(self) } + } + + #[inline] + fn reduce_min(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { 
intrinsics::simd_reduce_min(self) } + } + + #[inline] + fn reduce_and(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_and(self) } + } + + #[inline] + fn reduce_or(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_or(self) } + } + + #[inline] + fn reduce_xor(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_xor(self) } + } + } + )* + } +} + +impl_trait! { i8, i16, i32, i64, isize } diff --git a/crates/core_simd/src/elements/uint.rs b/crates/core_simd/src/elements/uint.rs new file mode 100644 index 00000000000..21e7e76eb3d --- /dev/null +++ b/crates/core_simd/src/elements/uint.rs @@ -0,0 +1,139 @@ +use super::sealed::Sealed; +use crate::simd::{intrinsics, LaneCount, Simd, SupportedLaneCount}; + +/// Operations on SIMD vectors of unsigned integers. +pub trait SimdUint: Copy + Sealed { + /// Scalar type contained by this SIMD vector type. + type Scalar; + + /// Lanewise saturating add. + /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdUint}; + /// use core::u32::MAX; + /// let x = Simd::from_array([2, 1, 0, MAX]); + /// let max = Simd::splat(MAX); + /// let unsat = x + max; + /// let sat = x.saturating_add(max); + /// assert_eq!(unsat, Simd::from_array([1, 0, MAX, MAX - 1])); + /// assert_eq!(sat, max); + /// ``` + fn saturating_add(self, second: Self) -> Self; + + /// Lanewise saturating subtract. 
+ /// + /// # Examples + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdUint}; + /// use core::u32::MAX; + /// let x = Simd::from_array([2, 1, 0, MAX]); + /// let max = Simd::splat(MAX); + /// let unsat = x - max; + /// let sat = x.saturating_sub(max); + /// assert_eq!(unsat, Simd::from_array([3, 2, 1, 0])); + /// assert_eq!(sat, Simd::splat(0)); + /// ``` + fn saturating_sub(self, second: Self) -> Self; + + /// Returns the sum of the lanes of the vector, with wrapping addition. + fn reduce_sum(self) -> Self::Scalar; + + /// Returns the product of the lanes of the vector, with wrapping multiplication. + fn reduce_product(self) -> Self::Scalar; + + /// Returns the maximum lane in the vector. + fn reduce_max(self) -> Self::Scalar; + + /// Returns the minimum lane in the vector. + fn reduce_min(self) -> Self::Scalar; + + /// Returns the cumulative bitwise "and" across the lanes of the vector. + fn reduce_and(self) -> Self::Scalar; + + /// Returns the cumulative bitwise "or" across the lanes of the vector. + fn reduce_or(self) -> Self::Scalar; + + /// Returns the cumulative bitwise "xor" across the lanes of the vector. + fn reduce_xor(self) -> Self::Scalar; +} + +macro_rules! 
impl_trait { + { $($ty:ty),* } => { + $( + impl Sealed for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + { + } + + impl SimdUint for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + { + type Scalar = $ty; + + #[inline] + fn saturating_add(self, second: Self) -> Self { + // Safety: `self` is a vector + unsafe { intrinsics::simd_saturating_add(self, second) } + } + + #[inline] + fn saturating_sub(self, second: Self) -> Self { + // Safety: `self` is a vector + unsafe { intrinsics::simd_saturating_sub(self, second) } + } + + #[inline] + fn reduce_sum(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_add_ordered(self, 0) } + } + + #[inline] + fn reduce_product(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_mul_ordered(self, 1) } + } + + #[inline] + fn reduce_max(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_max(self) } + } + + #[inline] + fn reduce_min(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_min(self) } + } + + #[inline] + fn reduce_and(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_and(self) } + } + + #[inline] + fn reduce_or(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_or(self) } + } + + #[inline] + fn reduce_xor(self) -> Self::Scalar { + // Safety: `self` is an integer vector + unsafe { intrinsics::simd_reduce_xor(self) } + } + } + )* + } +} + +impl_trait! { u8, u16, u32, u64, usize } diff --git a/crates/core_simd/src/eq.rs b/crates/core_simd/src/eq.rs new file mode 100644 index 00000000000..c7111f720a8 --- /dev/null +++ b/crates/core_simd/src/eq.rs @@ -0,0 +1,73 @@ +use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdElement, SupportedLaneCount}; + +/// Parallel `PartialEq`. 
+pub trait SimdPartialEq { + /// The mask type returned by each comparison. + type Mask; + + /// Test if each lane is equal to the corresponding lane in `other`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn simd_eq(self, other: Self) -> Self::Mask; + + /// Test if each lane is equal to the corresponding lane in `other`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn simd_ne(self, other: Self) -> Self::Mask; +} + +macro_rules! impl_number { + { $($number:ty),* } => { + $( + impl SimdPartialEq for Simd<$number, LANES> + where + LaneCount: SupportedLaneCount, + { + type Mask = Mask<<$number as SimdElement>::Mask, LANES>; + + #[inline] + fn simd_eq(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) } + } + + #[inline] + fn simd_ne(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) } + } + } + )* + } +} + +impl_number! { f32, f64, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize } + +macro_rules! impl_mask { + { $($integer:ty),* } => { + $( + impl SimdPartialEq for Mask<$integer, LANES> + where + LaneCount: SupportedLaneCount, + { + type Mask = Self; + + #[inline] + fn simd_eq(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Self::from_int_unchecked(intrinsics::simd_eq(self.to_int(), other.to_int())) } + } + + #[inline] + fn simd_ne(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Self::from_int_unchecked(intrinsics::simd_ne(self.to_int(), other.to_int())) } + } + } + )* + } +} + +impl_mask! 
{ i8, i16, i32, i64, isize } diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs index 3b316f12b3e..63723e2ec13 100644 --- a/crates/core_simd/src/lane_count.rs +++ b/crates/core_simd/src/lane_count.rs @@ -3,7 +3,7 @@ mod sealed { } use sealed::Sealed; -/// A type representing a vector lane count. +/// Specifies the number of lanes in a SIMD vector as a type. pub struct LaneCount; impl LaneCount { @@ -11,7 +11,11 @@ impl LaneCount { pub const BITMASK_LEN: usize = (LANES + 7) / 8; } -/// Helper trait for vector lane counts. +/// Statically guarantees that a lane count is marked as supported. +/// +/// This trait is *sealed*: the list of implementors below is total. +/// Users do not have the ability to mark additional `LaneCount` values as supported. +/// Only SIMD vectors with supported lane counts are constructable. pub trait SupportedLaneCount: Sealed { #[doc(hidden)] type BitMask: Copy + Default + AsRef<[u8]> + AsMut<[u8]>; diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs index 2632073622e..715f258f617 100644 --- a/crates/core_simd/src/lib.rs +++ b/crates/core_simd/src/lib.rs @@ -12,7 +12,7 @@ #![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))] #![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))] #![warn(missing_docs)] -#![deny(unsafe_op_in_unsafe_fn)] +#![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)] #![unstable(feature = "portable_simd", issue = "86656")] //! Portable SIMD module. 
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index e1cd7930450..c36c336d8a2 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -15,7 +15,10 @@ mod mask_impl; mod to_bitmask; pub use to_bitmask::ToBitMask; -use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SupportedLaneCount}; +#[cfg(feature = "generic_const_exprs")] +pub use to_bitmask::{bitmask_len, ToBitMaskArray}; + +use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SimdPartialEq, SupportedLaneCount}; use core::cmp::Ordering; use core::{fmt, mem}; @@ -56,7 +59,7 @@ macro_rules! impl_element { where LaneCount: SupportedLaneCount, { - (value.lanes_eq(Simd::splat(0)) | value.lanes_eq(Simd::splat(-1))).all() + (value.simd_eq(Simd::splat(0 as _)) | value.simd_eq(Simd::splat(-1 as _))).all() } fn eq(self, other: Self) -> bool { self == other } @@ -65,6 +68,7 @@ macro_rules! impl_element { const FALSE: Self = 0; } + // Safety: this is a valid mask element type unsafe impl MaskElement for $ty {} } } @@ -77,6 +81,8 @@ impl_element! { isize } /// A SIMD vector mask for `LANES` elements of width specified by `Element`. /// +/// Masks represent boolean inclusion/exclusion on a per-lane basis. +/// /// The layout of this type is unspecified. #[repr(transparent)] pub struct Mask(mask_impl::Mask) @@ -179,6 +185,13 @@ where self.0.to_int() } + /// Converts the mask to a mask of any other lane size. + #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] + pub fn cast(self) -> Mask { + Mask(self.0.convert()) + } + /// Tests the value of the specified lane. /// /// # Safety @@ -507,58 +520,58 @@ where } } -/// Vector of eight 8-bit masks +/// A mask for SIMD vectors with eight elements of 8 bits. pub type mask8x8 = Mask; -/// Vector of 16 8-bit masks +/// A mask for SIMD vectors with 16 elements of 8 bits. 
pub type mask8x16 = Mask; -/// Vector of 32 8-bit masks +/// A mask for SIMD vectors with 32 elements of 8 bits. pub type mask8x32 = Mask; -/// Vector of 16 8-bit masks +/// A mask for SIMD vectors with 64 elements of 8 bits. pub type mask8x64 = Mask; -/// Vector of four 16-bit masks +/// A mask for SIMD vectors with four elements of 16 bits. pub type mask16x4 = Mask; -/// Vector of eight 16-bit masks +/// A mask for SIMD vectors with eight elements of 16 bits. pub type mask16x8 = Mask; -/// Vector of 16 16-bit masks +/// A mask for SIMD vectors with 16 elements of 16 bits. pub type mask16x16 = Mask; -/// Vector of 32 16-bit masks +/// A mask for SIMD vectors with 32 elements of 16 bits. pub type mask16x32 = Mask; -/// Vector of two 32-bit masks +/// A mask for SIMD vectors with two elements of 32 bits. pub type mask32x2 = Mask; -/// Vector of four 32-bit masks +/// A mask for SIMD vectors with four elements of 32 bits. pub type mask32x4 = Mask; -/// Vector of eight 32-bit masks +/// A mask for SIMD vectors with eight elements of 32 bits. pub type mask32x8 = Mask; -/// Vector of 16 32-bit masks +/// A mask for SIMD vectors with 16 elements of 32 bits. pub type mask32x16 = Mask; -/// Vector of two 64-bit masks +/// A mask for SIMD vectors with two elements of 64 bits. pub type mask64x2 = Mask; -/// Vector of four 64-bit masks +/// A mask for SIMD vectors with four elements of 64 bits. pub type mask64x4 = Mask; -/// Vector of eight 64-bit masks +/// A mask for SIMD vectors with eight elements of 64 bits. pub type mask64x8 = Mask; -/// Vector of two pointer-width masks +/// A mask for SIMD vectors with two elements of pointer width. pub type masksizex2 = Mask; -/// Vector of four pointer-width masks +/// A mask for SIMD vectors with four elements of pointer width. pub type masksizex4 = Mask; -/// Vector of eight pointer-width masks +/// A mask for SIMD vectors with eight elements of pointer width. pub type masksizex8 = Mask; macro_rules! 
impl_from { @@ -569,7 +582,7 @@ macro_rules! impl_from { LaneCount: SupportedLaneCount, { fn from(value: Mask<$from, LANES>) -> Self { - Self(value.0.convert()) + value.cast() } } )* diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs index ec4dd357ee9..365ecc0a325 100644 --- a/crates/core_simd/src/masks/bitmask.rs +++ b/crates/core_simd/src/masks/bitmask.rs @@ -115,6 +115,26 @@ where unsafe { Self(intrinsics::simd_bitmask(value), PhantomData) } } + #[cfg(feature = "generic_const_exprs")] + #[inline] + #[must_use = "method returns a new array and does not mutate the original value"] + pub fn to_bitmask_array(self) -> [u8; N] { + assert!(core::mem::size_of::() == N); + + // Safety: converting an integer to an array of bytes of the same size is safe + unsafe { core::mem::transmute_copy(&self.0) } + } + + #[cfg(feature = "generic_const_exprs")] + #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] + pub fn from_bitmask_array(bitmask: [u8; N]) -> Self { + assert!(core::mem::size_of::() == N); + + // Safety: converting an array of bytes to an integer of the same size is safe + Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData) + } + #[inline] pub fn to_bitmask_integer(self) -> U where diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs index 8bbdf637de8..adf0fcbeae2 100644 --- a/crates/core_simd/src/masks/full_masks.rs +++ b/crates/core_simd/src/masks/full_masks.rs @@ -4,6 +4,9 @@ use super::MaskElement; use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask}; +#[cfg(feature = "generic_const_exprs")] +use crate::simd::ToBitMaskArray; + #[repr(transparent)] pub struct Mask(Simd) where @@ -68,14 +71,26 @@ where // Used for bitmask bit order workaround pub(crate) trait ReverseBits { - fn reverse_bits(self) -> Self; + // Reverse the least significant `n` bits of `self`. 
+ // (Remaining bits must be 0.) + fn reverse_bits(self, n: usize) -> Self; } macro_rules! impl_reverse_bits { { $($int:ty),* } => { $( impl ReverseBits for $int { - fn reverse_bits(self) -> Self { <$int>::reverse_bits(self) } + #[inline(always)] + fn reverse_bits(self, n: usize) -> Self { + let rev = <$int>::reverse_bits(self); + let bitsize = core::mem::size_of::<$int>() * 8; + if n < bitsize { + // Shift things back to the right + rev >> (bitsize - n) + } else { + rev + } + } } )* } @@ -127,6 +142,68 @@ where unsafe { Mask(intrinsics::simd_cast(self.0)) } } + #[cfg(feature = "generic_const_exprs")] + #[inline] + #[must_use = "method returns a new array and does not mutate the original value"] + pub fn to_bitmask_array(self) -> [u8; N] + where + super::Mask: ToBitMaskArray, + [(); as ToBitMaskArray>::BYTES]: Sized, + { + assert_eq!( as ToBitMaskArray>::BYTES, N); + + // Safety: N is the correct bitmask size + unsafe { + // Compute the bitmask + let bitmask: [u8; as ToBitMaskArray>::BYTES] = + intrinsics::simd_bitmask(self.0); + + // Transmute to the return type, previously asserted to be the same size + let mut bitmask: [u8; N] = core::mem::transmute_copy(&bitmask); + + // LLVM assumes bit order should match endianness + if cfg!(target_endian = "big") { + for x in bitmask.as_mut() { + *x = x.reverse_bits(); + } + }; + + bitmask + } + } + + #[cfg(feature = "generic_const_exprs")] + #[inline] + #[must_use = "method returns a new mask and does not mutate the original value"] + pub fn from_bitmask_array(mut bitmask: [u8; N]) -> Self + where + super::Mask: ToBitMaskArray, + [(); as ToBitMaskArray>::BYTES]: Sized, + { + assert_eq!( as ToBitMaskArray>::BYTES, N); + + // Safety: N is the correct bitmask size + unsafe { + // LLVM assumes bit order should match endianness + if cfg!(target_endian = "big") { + for x in bitmask.as_mut() { + *x = x.reverse_bits(); + } + } + + // Transmute to the bitmask type, previously asserted to be the same size + let bitmask: [u8; as 
ToBitMaskArray>::BYTES] = + core::mem::transmute_copy(&bitmask); + + // Compute the regular mask + Self::from_int_unchecked(intrinsics::simd_select_bitmask( + bitmask, + Self::splat(true).to_int(), + Self::splat(false).to_int(), + )) + } + } + #[inline] pub(crate) fn to_bitmask_integer(self) -> U where @@ -137,7 +214,7 @@ where // LLVM assumes bit order should match endianness if cfg!(target_endian = "big") { - bitmask.reverse_bits() + bitmask.reverse_bits(LANES) } else { bitmask } @@ -150,7 +227,7 @@ where { // LLVM assumes bit order should match endianness let bitmask = if cfg!(target_endian = "big") { - bitmask.reverse_bits() + bitmask.reverse_bits(LANES) } else { bitmask }; diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs index c263f6a4eec..65d3ce9be65 100644 --- a/crates/core_simd/src/masks/to_bitmask.rs +++ b/crates/core_simd/src/masks/to_bitmask.rs @@ -16,11 +16,7 @@ where /// Converts masks to and from integer bitmasks. /// /// Each bit of the bitmask corresponds to a mask lane, starting with the LSB. -/// -/// # Safety -/// This trait is `unsafe` and sealed, since the `BitMask` type must match the number of lanes in -/// the mask. -pub unsafe trait ToBitMask: Sealed { +pub trait ToBitMask: Sealed { /// The integer bitmask type. type BitMask; @@ -31,10 +27,25 @@ pub unsafe trait ToBitMask: Sealed { fn from_bitmask(bitmask: Self::BitMask) -> Self; } +/// Converts masks to and from byte array bitmasks. +/// +/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB of the first byte. +#[cfg(feature = "generic_const_exprs")] +pub trait ToBitMaskArray: Sealed { + /// The length of the bitmask array. + const BYTES: usize; + + /// Converts a mask to a bitmask. + fn to_bitmask_array(self) -> [u8; Self::BYTES]; + + /// Converts a bitmask to a mask. + fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self; +} + macro_rules! 
impl_integer_intrinsic { - { $(unsafe impl ToBitMask for Mask<_, $lanes:literal>)* } => { + { $(impl ToBitMask for Mask<_, $lanes:literal>)* } => { $( - unsafe impl ToBitMask for Mask { + impl ToBitMask for Mask { type BitMask = $int; fn to_bitmask(self) -> $int { @@ -50,11 +61,33 @@ macro_rules! impl_integer_intrinsic { } impl_integer_intrinsic! { - unsafe impl ToBitMask for Mask<_, 1> - unsafe impl ToBitMask for Mask<_, 2> - unsafe impl ToBitMask for Mask<_, 4> - unsafe impl ToBitMask for Mask<_, 8> - unsafe impl ToBitMask for Mask<_, 16> - unsafe impl ToBitMask for Mask<_, 32> - unsafe impl ToBitMask for Mask<_, 64> + impl ToBitMask for Mask<_, 1> + impl ToBitMask for Mask<_, 2> + impl ToBitMask for Mask<_, 4> + impl ToBitMask for Mask<_, 8> + impl ToBitMask for Mask<_, 16> + impl ToBitMask for Mask<_, 32> + impl ToBitMask for Mask<_, 64> +} + +/// Returns the minimum number of bytes in a bitmask with `lanes` lanes. +#[cfg(feature = "generic_const_exprs")] +pub const fn bitmask_len(lanes: usize) -> usize { + (lanes + 7) / 8 +} + +#[cfg(feature = "generic_const_exprs")] +impl ToBitMaskArray for Mask +where + LaneCount: SupportedLaneCount, +{ + const BYTES: usize = bitmask_len(LANES); + + fn to_bitmask_array(self) -> [u8; Self::BYTES] { + self.0.to_bitmask_array() + } + + fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self { + Mask(mask_impl::Mask::from_bitmask_array(bitmask)) + } } diff --git a/crates/core_simd/src/math.rs b/crates/core_simd/src/math.rs deleted file mode 100644 index 606021e983e..00000000000 --- a/crates/core_simd/src/math.rs +++ /dev/null @@ -1,156 +0,0 @@ -use crate::simd::intrinsics::{simd_saturating_add, simd_saturating_sub}; -use crate::simd::{LaneCount, Simd, SupportedLaneCount}; - -macro_rules! impl_uint_arith { - ($($ty:ty),+) => { - $( impl Simd<$ty, LANES> where LaneCount: SupportedLaneCount { - - /// Lanewise saturating add. 
- /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::MAX;")] - /// let x = Simd::from_array([2, 1, 0, MAX]); - /// let max = Simd::splat(MAX); - /// let unsat = x + max; - /// let sat = x.saturating_add(max); - /// assert_eq!(unsat, Simd::from_array([1, 0, MAX, MAX - 1])); - /// assert_eq!(sat, max); - /// ``` - #[inline] - pub fn saturating_add(self, second: Self) -> Self { - // Safety: `self` is a vector - unsafe { simd_saturating_add(self, second) } - } - - /// Lanewise saturating subtract. - /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::MAX;")] - /// let x = Simd::from_array([2, 1, 0, MAX]); - /// let max = Simd::splat(MAX); - /// let unsat = x - max; - /// let sat = x.saturating_sub(max); - /// assert_eq!(unsat, Simd::from_array([3, 2, 1, 0])); - /// assert_eq!(sat, Simd::splat(0)); - #[inline] - pub fn saturating_sub(self, second: Self) -> Self { - // Safety: `self` is a vector - unsafe { simd_saturating_sub(self, second) } - } - })+ - } -} - -macro_rules! impl_int_arith { - ($($ty:ty),+) => { - $( impl Simd<$ty, LANES> where LaneCount: SupportedLaneCount { - - /// Lanewise saturating add. - /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] - /// let x = Simd::from_array([MIN, 0, 1, MAX]); - /// let max = Simd::splat(MAX); - /// let unsat = x + max; - /// let sat = x.saturating_add(max); - /// assert_eq!(unsat, Simd::from_array([-1, MAX, MIN, -2])); - /// assert_eq!(sat, Simd::from_array([-1, MAX, MAX, MAX])); - /// ``` - #[inline] - pub fn saturating_add(self, second: Self) -> Self { - // Safety: `self` is a vector - unsafe { simd_saturating_add(self, second) } - } - - /// Lanewise saturating subtract. 
- /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] - /// let x = Simd::from_array([MIN, -2, -1, MAX]); - /// let max = Simd::splat(MAX); - /// let unsat = x - max; - /// let sat = x.saturating_sub(max); - /// assert_eq!(unsat, Simd::from_array([1, MAX, MIN, 0])); - /// assert_eq!(sat, Simd::from_array([MIN, MIN, MIN, 0])); - #[inline] - pub fn saturating_sub(self, second: Self) -> Self { - // Safety: `self` is a vector - unsafe { simd_saturating_sub(self, second) } - } - - /// Lanewise absolute value, implemented in Rust. - /// Every lane becomes its absolute value. - /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] - /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]); - /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0])); - /// ``` - #[inline] - pub fn abs(self) -> Self { - const SHR: $ty = <$ty>::BITS as $ty - 1; - let m = self >> Simd::splat(SHR); - (self^m) - m - } - - /// Lanewise saturating absolute value, implemented in Rust. - /// As abs(), except the MIN value becomes MAX instead of itself. - /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] - /// let xs = Simd::from_array([MIN, -2, 0, 3]); - /// let unsat = xs.abs(); - /// let sat = xs.saturating_abs(); - /// assert_eq!(unsat, Simd::from_array([MIN, 2, 0, 3])); - /// assert_eq!(sat, Simd::from_array([MAX, 2, 0, 3])); - /// ``` - #[inline] - pub fn saturating_abs(self) -> Self { - // arith shift for -1 or 0 mask based on sign bit, giving 2s complement - const SHR: $ty = <$ty>::BITS as $ty - 1; - let m = self >> Simd::splat(SHR); - (self^m).saturating_sub(m) - } - - /// Lanewise saturating negation, implemented in Rust. 
- /// As neg(), except the MIN value becomes MAX instead of itself. - /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")] - /// let x = Simd::from_array([MIN, -2, 3, MAX]); - /// let unsat = -x; - /// let sat = x.saturating_neg(); - /// assert_eq!(unsat, Simd::from_array([MIN, 2, -3, MIN + 1])); - /// assert_eq!(sat, Simd::from_array([MAX, 2, -3, MIN + 1])); - /// ``` - #[inline] - pub fn saturating_neg(self) -> Self { - Self::splat(0).saturating_sub(self) - } - })+ - } -} - -impl_uint_arith! { u8, u16, u32, u64, usize } -impl_int_arith! { i8, i16, i32, i64, isize } diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs index 85026265956..b472aa3abe2 100644 --- a/crates/core_simd/src/mod.rs +++ b/crates/core_simd/src/mod.rs @@ -1,6 +1,3 @@ -#[macro_use] -mod reduction; - #[macro_use] mod swizzle; @@ -9,14 +6,14 @@ pub(crate) mod intrinsics; #[cfg(feature = "generic_const_exprs")] mod to_bytes; -mod comparisons; +mod elements; +mod eq; mod fmt; mod iter; mod lane_count; mod masks; -mod math; mod ops; -mod round; +mod ord; mod select; mod vector; mod vendor; @@ -25,8 +22,11 @@ mod vendor; pub mod simd { pub(crate) use crate::core_simd::intrinsics; + pub use crate::core_simd::elements::*; + pub use crate::core_simd::eq::*; pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount}; pub use crate::core_simd::masks::*; + pub use crate::core_simd::ord::*; pub use crate::core_simd::swizzle::*; pub use crate::core_simd::vector::*; } diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs index 1b35b3e717a..5a077a469d8 100644 --- a/crates/core_simd/src/ops.rs +++ b/crates/core_simd/src/ops.rs @@ -1,4 +1,4 @@ -use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; +use crate::simd::{LaneCount, Simd, SimdElement, SimdPartialEq, SupportedLaneCount}; use core::ops::{Add, Mul}; use core::ops::{BitAnd, BitOr, 
BitXor}; use core::ops::{Div, Rem, Sub}; @@ -33,6 +33,7 @@ where macro_rules! unsafe_base { ($lhs:ident, $rhs:ident, {$simd_call:ident}, $($_:tt)*) => { + // Safety: $lhs and $rhs are vectors unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) } }; } @@ -48,6 +49,8 @@ macro_rules! unsafe_base { // cg_clif defaults to this, and scalar MIR shifts also default to wrapping macro_rules! wrap_bitshift { ($lhs:ident, $rhs:ident, {$simd_call:ident}, $int:ident) => { + #[allow(clippy::suspicious_arithmetic_impl)] + // Safety: $lhs and the bitand result are vectors unsafe { $crate::simd::intrinsics::$simd_call( $lhs, @@ -74,7 +77,7 @@ macro_rules! int_divrem_guard { $simd_call:ident }, $int:ident ) => { - if $rhs.lanes_eq(Simd::splat(0)).any() { + if $rhs.simd_eq(Simd::splat(0 as _)).any() { panic!($zero); } else { // Prevent otherwise-UB overflow on the MIN / -1 case. @@ -82,14 +85,15 @@ macro_rules! int_divrem_guard { // This should, at worst, optimize to a few branchless logical ops // Ideally, this entire conditional should evaporate // Fire LLVM and implement those manually if it doesn't get the hint - ($lhs.lanes_eq(Simd::splat(<$int>::MIN)) + ($lhs.simd_eq(Simd::splat(<$int>::MIN)) // type inference can break here, so cut an SInt to size - & $rhs.lanes_eq(Simd::splat(-1i64 as _))) - .select(Simd::splat(1), $rhs) + & $rhs.simd_eq(Simd::splat(-1i64 as _))) + .select(Simd::splat(1 as _), $rhs) } else { // Nice base case to make it easy to const-fold away the other branch. $rhs }; + // Safety: $lhs and rhs are vectors unsafe { $crate::simd::intrinsics::$simd_call($lhs, rhs) } } }; diff --git a/crates/core_simd/src/ops/unary.rs b/crates/core_simd/src/ops/unary.rs index 4ebea560fc6..4ad02215034 100644 --- a/crates/core_simd/src/ops/unary.rs +++ b/crates/core_simd/src/ops/unary.rs @@ -14,6 +14,7 @@ macro_rules! 
neg { #[inline] #[must_use = "operator returns a new vector without mutating the input"] fn neg(self) -> Self::Output { + // Safety: `self` is a signed vector unsafe { intrinsics::simd_neg(self) } } })* diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/ord.rs new file mode 100644 index 00000000000..9a87bc2e344 --- /dev/null +++ b/crates/core_simd/src/ord.rs @@ -0,0 +1,213 @@ +use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount}; + +/// Parallel `PartialOrd`. +pub trait SimdPartialOrd: SimdPartialEq { + /// Test if each lane is less than the corresponding lane in `other`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn simd_lt(self, other: Self) -> Self::Mask; + + /// Test if each lane is less than or equal to the corresponding lane in `other`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn simd_le(self, other: Self) -> Self::Mask; + + /// Test if each lane is greater than the corresponding lane in `other`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn simd_gt(self, other: Self) -> Self::Mask; + + /// Test if each lane is greater than or equal to the corresponding lane in `other`. + #[must_use = "method returns a new mask and does not mutate the original value"] + fn simd_ge(self, other: Self) -> Self::Mask; +} + +/// Parallel `Ord`. +pub trait SimdOrd: SimdPartialOrd { + /// Returns the lane-wise maximum with `other`. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn simd_max(self, other: Self) -> Self; + + /// Returns the lane-wise minimum with `other`. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn simd_min(self, other: Self) -> Self; + + /// Restrict each lane to a certain interval. + /// + /// For each lane, returns `max` if `self` is greater than `max`, and `min` if `self` is + /// less than `min`. 
Otherwise returns `self`. + /// + /// # Panics + /// + /// Panics if `min > max` on any lane. + #[must_use = "method returns a new vector and does not mutate the original value"] + fn simd_clamp(self, min: Self, max: Self) -> Self; +} + +macro_rules! impl_integer { + { $($integer:ty),* } => { + $( + impl SimdPartialOrd for Simd<$integer, LANES> + where + LaneCount: SupportedLaneCount, + { + #[inline] + fn simd_lt(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) } + } + + #[inline] + fn simd_le(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) } + } + + #[inline] + fn simd_gt(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) } + } + + #[inline] + fn simd_ge(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) } + } + } + + impl SimdOrd for Simd<$integer, LANES> + where + LaneCount: SupportedLaneCount, + { + #[inline] + fn simd_max(self, other: Self) -> Self { + self.simd_lt(other).select(other, self) + } + + #[inline] + fn simd_min(self, other: Self) -> Self { + self.simd_gt(other).select(other, self) + } + + #[inline] + fn simd_clamp(self, min: Self, max: Self) -> Self { + assert!( + min.simd_le(max).all(), + "each lane in `min` must be less than or equal to the corresponding lane in `max`", + ); + self.simd_max(min).simd_min(max) + } + } + )* + } +} + +impl_integer! { u8, u16, u32, u64, usize, i8, i16, i32, i64, isize } + +macro_rules! 
impl_float { + { $($float:ty),* } => { + $( + impl SimdPartialOrd for Simd<$float, LANES> + where + LaneCount: SupportedLaneCount, + { + #[inline] + fn simd_lt(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) } + } + + #[inline] + fn simd_le(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) } + } + + #[inline] + fn simd_gt(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) } + } + + #[inline] + fn simd_ge(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) } + } + } + )* + } +} + +impl_float! { f32, f64 } + +macro_rules! impl_mask { + { $($integer:ty),* } => { + $( + impl SimdPartialOrd for Mask<$integer, LANES> + where + LaneCount: SupportedLaneCount, + { + #[inline] + fn simd_lt(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Self::from_int_unchecked(intrinsics::simd_lt(self.to_int(), other.to_int())) } + } + + #[inline] + fn simd_le(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Self::from_int_unchecked(intrinsics::simd_le(self.to_int(), other.to_int())) } + } + + #[inline] + fn simd_gt(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. 
+ unsafe { Self::from_int_unchecked(intrinsics::simd_gt(self.to_int(), other.to_int())) } + } + + #[inline] + fn simd_ge(self, other: Self) -> Self::Mask { + // Safety: `self` is a vector, and the result of the comparison + // is always a valid mask. + unsafe { Self::from_int_unchecked(intrinsics::simd_ge(self.to_int(), other.to_int())) } + } + } + + impl SimdOrd for Mask<$integer, LANES> + where + LaneCount: SupportedLaneCount, + { + #[inline] + fn simd_max(self, other: Self) -> Self { + self.simd_gt(other).select_mask(other, self) + } + + #[inline] + fn simd_min(self, other: Self) -> Self { + self.simd_lt(other).select_mask(other, self) + } + + #[inline] + fn simd_clamp(self, min: Self, max: Self) -> Self { + assert!( + min.simd_le(max).all(), + "each lane in `min` must be less than or equal to the corresponding lane in `max`", + ); + self.simd_max(min).simd_min(max) + } + } + )* + } +} + +impl_mask! { i8, i16, i32, i64, isize } diff --git a/crates/core_simd/src/reduction.rs b/crates/core_simd/src/reduction.rs deleted file mode 100644 index 3177fd167fc..00000000000 --- a/crates/core_simd/src/reduction.rs +++ /dev/null @@ -1,153 +0,0 @@ -use crate::simd::intrinsics::{ - simd_reduce_add_ordered, simd_reduce_and, simd_reduce_max, simd_reduce_min, - simd_reduce_mul_ordered, simd_reduce_or, simd_reduce_xor, -}; -use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; -use core::ops::{BitAnd, BitOr, BitXor}; - -macro_rules! impl_integer_reductions { - { $scalar:ty } => { - impl Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - /// Reducing wrapping add. Returns the sum of the lanes of the vector, with wrapping addition. - #[inline] - pub fn reduce_sum(self) -> $scalar { - // Safety: `self` is an integer vector - unsafe { simd_reduce_add_ordered(self, 0) } - } - - /// Reducing wrapping multiply. Returns the product of the lanes of the vector, with wrapping multiplication. 
- #[inline] - pub fn reduce_product(self) -> $scalar { - // Safety: `self` is an integer vector - unsafe { simd_reduce_mul_ordered(self, 1) } - } - - /// Reducing maximum. Returns the maximum lane in the vector. - #[inline] - pub fn reduce_max(self) -> $scalar { - // Safety: `self` is an integer vector - unsafe { simd_reduce_max(self) } - } - - /// Reducing minimum. Returns the minimum lane in the vector. - #[inline] - pub fn reduce_min(self) -> $scalar { - // Safety: `self` is an integer vector - unsafe { simd_reduce_min(self) } - } - } - } -} - -impl_integer_reductions! { i8 } -impl_integer_reductions! { i16 } -impl_integer_reductions! { i32 } -impl_integer_reductions! { i64 } -impl_integer_reductions! { isize } -impl_integer_reductions! { u8 } -impl_integer_reductions! { u16 } -impl_integer_reductions! { u32 } -impl_integer_reductions! { u64 } -impl_integer_reductions! { usize } - -macro_rules! impl_float_reductions { - { $scalar:ty } => { - impl Simd<$scalar, LANES> - where - LaneCount: SupportedLaneCount, - { - - /// Reducing add. Returns the sum of the lanes of the vector. - #[inline] - pub fn reduce_sum(self) -> $scalar { - // LLVM sum is inaccurate on i586 - if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { - self.as_array().iter().sum() - } else { - // Safety: `self` is a float vector - unsafe { simd_reduce_add_ordered(self, 0.) } - } - } - - /// Reducing multiply. Returns the product of the lanes of the vector. - #[inline] - pub fn reduce_product(self) -> $scalar { - // LLVM product is inaccurate on i586 - if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) { - self.as_array().iter().product() - } else { - // Safety: `self` is a float vector - unsafe { simd_reduce_mul_ordered(self, 1.) } - } - } - - /// Reducing maximum. Returns the maximum lane in the vector. - /// - /// Returns values based on equality, so a vector containing both `0.` and `-0.` may - /// return either. 
This function will not return `NaN` unless all lanes are `NaN`. - #[inline] - pub fn reduce_max(self) -> $scalar { - // Safety: `self` is a float vector - unsafe { simd_reduce_max(self) } - } - - /// Reducing minimum. Returns the minimum lane in the vector. - /// - /// Returns values based on equality, so a vector containing both `0.` and `-0.` may - /// return either. This function will not return `NaN` unless all lanes are `NaN`. - #[inline] - pub fn reduce_min(self) -> $scalar { - // Safety: `self` is a float vector - unsafe { simd_reduce_min(self) } - } - } - } -} - -impl_float_reductions! { f32 } -impl_float_reductions! { f64 } - -impl Simd -where - Self: BitAnd, - T: SimdElement + BitAnd, - LaneCount: SupportedLaneCount, -{ - /// Reducing bitwise "and". Returns the cumulative bitwise "and" across the lanes of - /// the vector. - #[inline] - pub fn reduce_and(self) -> T { - unsafe { simd_reduce_and(self) } - } -} - -impl Simd -where - Self: BitOr, - T: SimdElement + BitOr, - LaneCount: SupportedLaneCount, -{ - /// Reducing bitwise "or". Returns the cumulative bitwise "or" across the lanes of - /// the vector. - #[inline] - pub fn reduce_or(self) -> T { - unsafe { simd_reduce_or(self) } - } -} - -impl Simd -where - Self: BitXor, - T: SimdElement + BitXor, - LaneCount: SupportedLaneCount, -{ - /// Reducing bitwise "xor". Returns the cumulative bitwise "xor" across the lanes of - /// the vector. - #[inline] - pub fn reduce_xor(self) -> T { - unsafe { simd_reduce_xor(self) } - } -} diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs deleted file mode 100644 index 556bc2cc1fe..00000000000 --- a/crates/core_simd/src/round.rs +++ /dev/null @@ -1,40 +0,0 @@ -use crate::simd::intrinsics; -use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; -use core::convert::FloatToInt; - -macro_rules! 
implement { - { - $type:ty - } => { - impl Simd<$type, LANES> - where - LaneCount: SupportedLaneCount, - { - /// Rounds toward zero and converts to the same-width integer type, assuming that - /// the value is finite and fits in that type. - /// - /// # Safety - /// The value must: - /// - /// * Not be NaN - /// * Not be infinite - /// * Be representable in the return type, after truncating off its fractional part - /// - /// If these requirements are infeasible or costly, consider using the safe function [cast], - /// which saturates on conversion. - /// - /// [cast]: Simd::cast - #[inline] - pub unsafe fn to_int_unchecked(self) -> Simd - where - $type: FloatToInt, - I: SimdElement, - { - unsafe { intrinsics::simd_cast(self) } - } - } - } -} - -implement! { f32 } -implement! { f64 } diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs index ef47c4f3a4c..22999d24950 100644 --- a/crates/core_simd/src/swizzle.rs +++ b/crates/core_simd/src/swizzle.rs @@ -1,44 +1,46 @@ use crate::simd::intrinsics; use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; -/// Constructs a new vector by selecting values from the lanes of the source vector or vectors to use. +/// Constructs a new SIMD vector by copying elements from selected lanes in other vectors. /// -/// When swizzling one vector, the indices of the result vector are indicated by a `const` array -/// of `usize`, like [`Swizzle`]. -/// When swizzling two vectors, the indices are indicated by a `const` array of [`Which`], like -/// [`Swizzle2`]. +/// When swizzling one vector, lanes are selected by a `const` array of `usize`, +/// like [`Swizzle`]. +/// +/// When swizzling two vectors, lanes are selected by a `const` array of [`Which`], +/// like [`Swizzle2`]. 
/// /// # Examples -/// ## One source vector +/// +/// With a single SIMD vector, the const array specifies lane indices in that vector: /// ``` /// # #![feature(portable_simd)] -/// # use core::simd::{Simd, simd_swizzle}; -/// let v = Simd::::from_array([0., 1., 2., 3.]); +/// # use core::simd::{u32x2, u32x4, simd_swizzle}; +/// let v = u32x4::from_array([10, 11, 12, 13]); /// /// // Keeping the same size -/// let r = simd_swizzle!(v, [3, 0, 1, 2]); -/// assert_eq!(r.to_array(), [3., 0., 1., 2.]); +/// let r: u32x4 = simd_swizzle!(v, [3, 0, 1, 2]); +/// assert_eq!(r.to_array(), [13, 10, 11, 12]); /// /// // Changing the number of lanes -/// let r = simd_swizzle!(v, [3, 1]); -/// assert_eq!(r.to_array(), [3., 1.]); +/// let r: u32x2 = simd_swizzle!(v, [3, 1]); +/// assert_eq!(r.to_array(), [13, 11]); /// ``` /// -/// ## Two source vectors +/// With two input SIMD vectors, the const array uses `Which` to specify the source of each index: /// ``` /// # #![feature(portable_simd)] -/// # use core::simd::{Simd, simd_swizzle, Which}; -/// use Which::*; -/// let a = Simd::::from_array([0., 1., 2., 3.]); -/// let b = Simd::::from_array([4., 5., 6., 7.]); +/// # use core::simd::{u32x2, u32x4, simd_swizzle, Which}; +/// use Which::{First, Second}; +/// let a = u32x4::from_array([0, 1, 2, 3]); +/// let b = u32x4::from_array([4, 5, 6, 7]); /// /// // Keeping the same size -/// let r = simd_swizzle!(a, b, [First(0), First(1), Second(2), Second(3)]); -/// assert_eq!(r.to_array(), [0., 1., 6., 7.]); +/// let r: u32x4 = simd_swizzle!(a, b, [First(0), First(1), Second(2), Second(3)]); +/// assert_eq!(r.to_array(), [0, 1, 6, 7]); /// /// // Changing the number of lanes -/// let r = simd_swizzle!(a, b, [First(0), Second(0)]); -/// assert_eq!(r.to_array(), [0., 4.]); +/// let r: u32x2 = simd_swizzle!(a, b, [First(0), Second(0)]); +/// assert_eq!(r.to_array(), [0, 4]); /// ``` #[allow(unused_macros)] pub macro simd_swizzle { @@ -68,12 +70,14 @@ pub macro simd_swizzle { } } -/// An 
index into one of two vectors. +/// Specifies a lane index into one of two SIMD vectors. +/// +/// This is an input type for [Swizzle2] and helper macros like [simd_swizzle]. #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Which { - /// Indexes the first vector. + /// Index of a lane in the first input SIMD vector. First(usize), - /// Indexes the second vector. + /// Index of a lane in the second input SIMD vector. Second(usize), } diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index b9cd2e2021e..78f56402eba 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -9,8 +9,9 @@ pub use uint::*; // Vectors of pointers are not for public use at the current time. pub(crate) mod ptr; -use crate::simd::intrinsics; -use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount}; +use crate::simd::{ + intrinsics, LaneCount, Mask, MaskElement, SimdPartialOrd, SupportedLaneCount, Swizzle, +}; /// A SIMD vector of `LANES` elements of type `T`. `Simd` has the same shape as [`[T; N]`](array), but operates like `T`. /// @@ -99,17 +100,50 @@ where /// Number of lanes in this vector. pub const LANES: usize = LANES; - /// Get the number of lanes in this vector. + /// Returns the number of lanes in this SIMD vector. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # use core::simd::u32x4; + /// let v = u32x4::splat(0); + /// assert_eq!(v.lanes(), 4); + /// ``` pub const fn lanes(&self) -> usize { LANES } - /// Construct a SIMD vector by setting all lanes to the given value. - pub const fn splat(value: T) -> Self { - Self([value; LANES]) + /// Constructs a new SIMD vector with all lanes set to the given value. 
+ /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # use core::simd::u32x4; + /// let v = u32x4::splat(8); + /// assert_eq!(v.as_array(), &[8, 8, 8, 8]); + /// ``` + pub fn splat(value: T) -> Self { + // This is preferred over `[value; LANES]`, since it's explicitly a splat: + // https://github.com/rust-lang/rust/issues/97804 + struct Splat; + impl Swizzle<1, LANES> for Splat { + const INDEX: [usize; LANES] = [0; LANES]; + } + Splat::swizzle(Simd::::from([value])) } /// Returns an array reference containing the entire SIMD vector. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # use core::simd::{Simd, u64x4}; + /// let v: u64x4 = Simd::from_array([0, 1, 2, 3]); + /// assert_eq!(v.as_array(), &[0, 1, 2, 3]); + /// ``` pub const fn as_array(&self) -> &[T; LANES] { &self.0 } @@ -129,9 +163,21 @@ where self.0 } - /// Converts a slice to a SIMD vector containing `slice[..LANES]` + /// Converts a slice to a SIMD vector containing `slice[..LANES]`. + /// /// # Panics - /// `from_slice` will panic if the slice's `len` is less than the vector's `Simd::LANES`. + /// + /// Panics if the slice's length is less than the vector's `Simd::LANES`. + /// + /// # Examples + /// + /// ``` + /// # #![feature(portable_simd)] + /// # use core::simd::u32x4; + /// let source = vec![1, 2, 3, 4, 5, 6]; + /// let v = u32x4::from_slice(&source); + /// assert_eq!(v.as_array(), &[1, 2, 3, 4]); + /// ``` #[must_use] pub const fn from_slice(slice: &[T]) -> Self { assert!(slice.len() >= LANES, "slice length must be at least the number of lanes"); @@ -145,6 +191,7 @@ where } /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type. 
+ /// /// This follows the semantics of Rust's `as` conversion for casting /// integers to unsigned integers (interpreting as the other type, so `-1` to `MAX`), /// and from floats to integers (truncating, or saturating at the limits) for each lane, @@ -169,10 +216,35 @@ where #[must_use] #[inline] pub fn cast(self) -> Simd { - // Safety: The input argument is a vector of a known SIMD type. + // Safety: The input argument is a vector of a valid SIMD element type. unsafe { intrinsics::simd_as(self) } } + /// Rounds toward zero and converts to the same-width integer type, assuming that + /// the value is finite and fits in that type. + /// + /// # Safety + /// The value must: + /// + /// * Not be NaN + /// * Not be infinite + /// * Be representable in the return type, after truncating off its fractional part + /// + /// If these requirements are infeasible or costly, consider using the safe function [cast], + /// which saturates on conversion. + /// + /// [cast]: Simd::cast + #[inline] + pub unsafe fn to_int_unchecked(self) -> Simd + where + T: core::convert::FloatToInt, + I: SimdElement, + { + // Safety: `self` is a vector, and `FloatToInt` ensures the type can be casted to + // an integer. + unsafe { intrinsics::simd_cast(self) } + } + /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. /// If an index is out-of-bounds, the lane is instead selected from the `or` vector. /// @@ -239,7 +311,7 @@ where idxs: Simd, or: Self, ) -> Self { - let enable: Mask = enable & idxs.lanes_lt(Simd::splat(slice.len())); + let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); // Safety: We have masked-off out-of-bounds lanes. 
unsafe { Self::gather_select_unchecked(slice, enable, idxs, or) } } @@ -256,13 +328,15 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # use core::simd::{Simd, Mask}; + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdPartialOrd, Mask}; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 5]); /// let alt = Simd::from_array([-5, -4, -3, -2]); /// let enable = Mask::from_array([true, true, true, false]); // Note the final mask lane. /// // If this mask was used to gather, it would be unsound. Let's fix that. - /// let enable = enable & idxs.lanes_lt(Simd::splat(vec.len())); + /// let enable = enable & idxs.simd_lt(Simd::splat(vec.len())); /// /// // We have masked the OOB lane, so it's safe to gather now. /// let result = unsafe { Simd::gather_select_unchecked(&vec, enable, idxs, alt) }; @@ -313,7 +387,9 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # use core::simd::{Simd, Mask}; + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, Mask}; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 0]); /// let vals = Simd::from_array([-27, 82, -41, 124]); @@ -329,7 +405,7 @@ where enable: Mask, idxs: Simd, ) { - let enable: Mask = enable & idxs.lanes_lt(Simd::splat(slice.len())); + let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); // Safety: We have masked-off out-of-bounds lanes. 
unsafe { self.scatter_select_unchecked(slice, enable, idxs) } } @@ -347,13 +423,15 @@ where /// # Examples /// ``` /// # #![feature(portable_simd)] - /// # use core::simd::{Simd, Mask}; + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdPartialOrd, Mask}; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 0]); /// let vals = Simd::from_array([-27, 82, -41, 124]); /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane. /// // If this mask was used to scatter, it would be unsound. Let's fix that. - /// let enable = enable & idxs.lanes_lt(Simd::splat(vec.len())); + /// let enable = enable & idxs.simd_lt(Simd::splat(vec.len())); /// /// // We have masked the OOB lane, so it's safe to scatter now. /// unsafe { vals.scatter_select_unchecked(&mut vec, enable, idxs); } @@ -425,8 +503,27 @@ where { #[inline] fn eq(&self, other: &Self) -> bool { - // TODO use SIMD equality - self.to_array() == other.to_array() + // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask. + let mask = unsafe { + let tfvec: Simd<::Mask, LANES> = intrinsics::simd_eq(*self, *other); + Mask::from_int_unchecked(tfvec) + }; + + // Two vectors are equal if all lanes tested true for vertical equality. + mask.all() + } + + #[allow(clippy::partialeq_ne_impl)] + #[inline] + fn ne(&self, other: &Self) -> bool { + // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask. + let mask = unsafe { + let tfvec: Simd<::Mask, LANES> = intrinsics::simd_ne(*self, *other); + Mask::from_int_unchecked(tfvec) + }; + + // Two vectors are non-equal if any lane tested true for vertical non-equality. 
+ mask.any() } } @@ -561,61 +658,85 @@ pub unsafe trait SimdElement: Sealed + Copy { } impl Sealed for u8 {} + +// Safety: u8 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for u8 { type Mask = i8; } impl Sealed for u16 {} + +// Safety: u16 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for u16 { type Mask = i16; } impl Sealed for u32 {} + +// Safety: u32 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for u32 { type Mask = i32; } impl Sealed for u64 {} + +// Safety: u64 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for u64 { type Mask = i64; } impl Sealed for usize {} + +// Safety: usize is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for usize { type Mask = isize; } impl Sealed for i8 {} + +// Safety: i8 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for i8 { type Mask = i8; } impl Sealed for i16 {} + +// Safety: i16 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for i16 { type Mask = i16; } impl Sealed for i32 {} + +// Safety: i32 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for i32 { type Mask = i32; } impl Sealed for i64 {} + +// Safety: i64 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for i64 { type Mask = i64; } impl Sealed for isize {} + +// Safety: isize is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for isize { type Mask = isize; } impl Sealed for f32 {} + +// Safety: f32 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for f32 { type Mask = i32; } impl Sealed for f64 {} + +// Safety: f64 is a valid SIMD element type, and is supported by this API unsafe impl SimdElement for f64 { type Mask = i64; } diff --git a/crates/core_simd/src/vector/float.rs 
b/crates/core_simd/src/vector/float.rs index fcc7f6d8d1c..f836c99b1e2 100644 --- a/crates/core_simd/src/vector/float.rs +++ b/crates/core_simd/src/vector/float.rs @@ -1,199 +1,24 @@ #![allow(non_camel_case_types)] -use crate::simd::intrinsics; -use crate::simd::{LaneCount, Mask, Simd, SupportedLaneCount}; +use crate::simd::Simd; -/// Implements inherent methods for a float vector containing multiple -/// `$lanes` of float `$type`, which uses `$bits_ty` as its binary -/// representation. -macro_rules! impl_float_vector { - { $type:ty, $bits_ty:ty, $mask_ty:ty } => { - impl Simd<$type, LANES> - where - LaneCount: SupportedLaneCount, - { - /// Raw transmutation to an unsigned integer vector type with the - /// same size and number of lanes. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn to_bits(self) -> Simd<$bits_ty, LANES> { - assert_eq!(core::mem::size_of::(), core::mem::size_of::>()); - unsafe { core::mem::transmute_copy(&self) } - } - - /// Raw transmutation from an unsigned integer vector type with the - /// same size and number of lanes. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn from_bits(bits: Simd<$bits_ty, LANES>) -> Self { - assert_eq!(core::mem::size_of::(), core::mem::size_of::>()); - unsafe { core::mem::transmute_copy(&bits) } - } - - /// Produces a vector where every lane has the absolute value of the - /// equivalently-indexed lane in `self`. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn abs(self) -> Self { - unsafe { intrinsics::simd_fabs(self) } - } - - /// Takes the reciprocal (inverse) of each lane, `1/x`. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn recip(self) -> Self { - Self::splat(1.0) / self - } - - /// Converts each lane from radians to degrees. 
- #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn to_degrees(self) -> Self { - // to_degrees uses a special constant for better precision, so extract that constant - self * Self::splat(<$type>::to_degrees(1.)) - } - - /// Converts each lane from degrees to radians. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn to_radians(self) -> Self { - self * Self::splat(<$type>::to_radians(1.)) - } - - /// Returns true for each lane if it has a positive sign, including - /// `+0.0`, `NaN`s with positive sign bit and positive infinity. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_sign_positive(self) -> Mask<$mask_ty, LANES> { - !self.is_sign_negative() - } - - /// Returns true for each lane if it has a negative sign, including - /// `-0.0`, `NaN`s with negative sign bit and negative infinity. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_sign_negative(self) -> Mask<$mask_ty, LANES> { - let sign_bits = self.to_bits() & Simd::splat((!0 >> 1) + 1); - sign_bits.lanes_gt(Simd::splat(0)) - } - - /// Returns true for each lane if its value is `NaN`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_nan(self) -> Mask<$mask_ty, LANES> { - self.lanes_ne(self) - } - - /// Returns true for each lane if its value is positive infinity or negative infinity. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_infinite(self) -> Mask<$mask_ty, LANES> { - self.abs().lanes_eq(Self::splat(<$type>::INFINITY)) - } - - /// Returns true for each lane if its value is neither infinite nor `NaN`. 
- #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_finite(self) -> Mask<$mask_ty, LANES> { - self.abs().lanes_lt(Self::splat(<$type>::INFINITY)) - } - - /// Returns true for each lane if its value is subnormal. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_subnormal(self) -> Mask<$mask_ty, LANES> { - self.abs().lanes_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(<$type>::INFINITY).to_bits()).lanes_eq(Simd::splat(0)) - } - - /// Returns true for each lane if its value is neither zero, infinite, - /// subnormal, nor `NaN`. - #[inline] - #[must_use = "method returns a new mask and does not mutate the original value"] - pub fn is_normal(self) -> Mask<$mask_ty, LANES> { - !(self.abs().lanes_eq(Self::splat(0.0)) | self.is_nan() | self.is_subnormal() | self.is_infinite()) - } - - /// Replaces each lane with a number that represents its sign. - /// - /// * `1.0` if the number is positive, `+0.0`, or `INFINITY` - /// * `-1.0` if the number is negative, `-0.0`, or `NEG_INFINITY` - /// * `NAN` if the number is `NAN` - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn signum(self) -> Self { - self.is_nan().select(Self::splat(<$type>::NAN), Self::splat(1.0).copysign(self)) - } - - /// Returns each lane with the magnitude of `self` and the sign of `sign`. - /// - /// If any lane is a `NAN`, then a `NAN` with the sign of `sign` is returned. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn copysign(self, sign: Self) -> Self { - let sign_bit = sign.to_bits() & Self::splat(-0.).to_bits(); - let magnitude = self.to_bits() & !Self::splat(-0.).to_bits(); - Self::from_bits(sign_bit | magnitude) - } - - /// Returns the minimum of each lane. - /// - /// If one of the values is `NAN`, then the other value is returned. 
- #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn min(self, other: Self) -> Self { - unsafe { intrinsics::simd_fmin(self, other) } - } - - /// Returns the maximum of each lane. - /// - /// If one of the values is `NAN`, then the other value is returned. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn max(self, other: Self) -> Self { - unsafe { intrinsics::simd_fmax(self, other) } - } - - /// Restrict each lane to a certain interval unless it is NaN. - /// - /// For each lane in `self`, returns the corresponding lane in `max` if the lane is - /// greater than `max`, and the corresponding lane in `min` if the lane is less - /// than `min`. Otherwise returns the lane in `self`. - #[inline] - #[must_use = "method returns a new vector and does not mutate the original value"] - pub fn clamp(self, min: Self, max: Self) -> Self { - assert!( - min.lanes_le(max).all(), - "each lane in `min` must be less than or equal to the corresponding lane in `max`", - ); - let mut x = self; - x = x.lanes_lt(min).select(min, x); - x = x.lanes_gt(max).select(max, x); - x - } - } - }; -} - -impl_float_vector! { f32, u32, i32 } -impl_float_vector! { f64, u64, i64 } - -/// Vector of two `f32` values +/// A 64-bit SIMD vector with two elements of type `f32`. pub type f32x2 = Simd; -/// Vector of four `f32` values +/// A 128-bit SIMD vector with four elements of type `f32`. pub type f32x4 = Simd; -/// Vector of eight `f32` values +/// A 256-bit SIMD vector with eight elements of type `f32`. pub type f32x8 = Simd; -/// Vector of 16 `f32` values +/// A 512-bit SIMD vector with 16 elements of type `f32`. pub type f32x16 = Simd; -/// Vector of two `f64` values +/// A 128-bit SIMD vector with two elements of type `f64`. pub type f64x2 = Simd; -/// Vector of four `f64` values +/// A 256-bit SIMD vector with four elements of type `f64`. 
pub type f64x4 = Simd; -/// Vector of eight `f64` values +/// A 512-bit SIMD vector with eight elements of type `f64`. pub type f64x8 = Simd; diff --git a/crates/core_simd/src/vector/int.rs b/crates/core_simd/src/vector/int.rs index 3eac02a2761..20e56c7dc64 100644 --- a/crates/core_simd/src/vector/int.rs +++ b/crates/core_simd/src/vector/int.rs @@ -1,103 +1,63 @@ #![allow(non_camel_case_types)] -use crate::simd::{LaneCount, Mask, Simd, SupportedLaneCount}; +use crate::simd::Simd; -/// Implements additional integer traits (Eq, Ord, Hash) on the specified vector `$name`, holding multiple `$lanes` of `$type`. -macro_rules! impl_integer_vector { - { $type:ty } => { - impl Simd<$type, LANES> - where - LaneCount: SupportedLaneCount, - { - /// Returns true for each positive lane and false if it is zero or negative. - #[inline] - pub fn is_positive(self) -> Mask<$type, LANES> { - self.lanes_gt(Self::splat(0)) - } - - /// Returns true for each negative lane and false if it is zero or positive. - #[inline] - pub fn is_negative(self) -> Mask<$type, LANES> { - self.lanes_lt(Self::splat(0)) - } - - /// Returns numbers representing the sign of each lane. - /// * `0` if the number is zero - /// * `1` if the number is positive - /// * `-1` if the number is negative - #[inline] - pub fn signum(self) -> Self { - self.is_positive().select( - Self::splat(1), - self.is_negative().select(Self::splat(-1), Self::splat(0)) - ) - } - } - } -} - -impl_integer_vector! { isize } -impl_integer_vector! { i16 } -impl_integer_vector! { i32 } -impl_integer_vector! { i64 } -impl_integer_vector! { i8 } - -/// Vector of two `isize` values +/// A SIMD vector with two elements of type `isize`. pub type isizex2 = Simd; -/// Vector of four `isize` values +/// A SIMD vector with four elements of type `isize`. pub type isizex4 = Simd; -/// Vector of eight `isize` values +/// A SIMD vector with eight elements of type `isize`. 
pub type isizex8 = Simd; -/// Vector of two `i16` values +/// A 32-bit SIMD vector with two elements of type `i16`. pub type i16x2 = Simd; -/// Vector of four `i16` values +/// A 64-bit SIMD vector with four elements of type `i16`. pub type i16x4 = Simd; -/// Vector of eight `i16` values +/// A 128-bit SIMD vector with eight elements of type `i16`. pub type i16x8 = Simd; -/// Vector of 16 `i16` values +/// A 256-bit SIMD vector with 16 elements of type `i16`. pub type i16x16 = Simd; -/// Vector of 32 `i16` values +/// A 512-bit SIMD vector with 32 elements of type `i16`. pub type i16x32 = Simd; -/// Vector of two `i32` values +/// A 64-bit SIMD vector with two elements of type `i32`. pub type i32x2 = Simd; -/// Vector of four `i32` values +/// A 128-bit SIMD vector with four elements of type `i32`. pub type i32x4 = Simd; -/// Vector of eight `i32` values +/// A 256-bit SIMD vector with eight elements of type `i32`. pub type i32x8 = Simd; -/// Vector of 16 `i32` values +/// A 512-bit SIMD vector with 16 elements of type `i32`. pub type i32x16 = Simd; -/// Vector of two `i64` values +/// A 128-bit SIMD vector with two elements of type `i64`. pub type i64x2 = Simd; -/// Vector of four `i64` values +/// A 256-bit SIMD vector with four elements of type `i64`. pub type i64x4 = Simd; -/// Vector of eight `i64` values +/// A 512-bit SIMD vector with eight elements of type `i64`. pub type i64x8 = Simd; -/// Vector of four `i8` values +/// A 32-bit SIMD vector with four elements of type `i8`. pub type i8x4 = Simd; -/// Vector of eight `i8` values +/// A 64-bit SIMD vector with eight elements of type `i8`. pub type i8x8 = Simd; -/// Vector of 16 `i8` values +/// A 128-bit SIMD vector with 16 elements of type `i8`. pub type i8x16 = Simd; -/// Vector of 32 `i8` values +/// A 256-bit SIMD vector with 32 elements of type `i8`. pub type i8x32 = Simd; -/// Vector of 64 `i8` values +/// A 512-bit SIMD vector with 64 elements of type `i8`. 
pub type i8x64 = Simd; diff --git a/crates/core_simd/src/vector/uint.rs b/crates/core_simd/src/vector/uint.rs index ed91fc3640e..b4a69c44363 100644 --- a/crates/core_simd/src/vector/uint.rs +++ b/crates/core_simd/src/vector/uint.rs @@ -2,62 +2,62 @@ use crate::simd::Simd; -/// Vector of two `usize` values +/// A SIMD vector with two elements of type `usize`. pub type usizex2 = Simd; -/// Vector of four `usize` values +/// A SIMD vector with four elements of type `usize`. pub type usizex4 = Simd; -/// Vector of eight `usize` values +/// A SIMD vector with eight elements of type `usize`. pub type usizex8 = Simd; -/// Vector of two `u16` values +/// A 32-bit SIMD vector with two elements of type `u16`. pub type u16x2 = Simd; -/// Vector of four `u16` values +/// A 64-bit SIMD vector with four elements of type `u16`. pub type u16x4 = Simd; -/// Vector of eight `u16` values +/// A 128-bit SIMD vector with eight elements of type `u16`. pub type u16x8 = Simd; -/// Vector of 16 `u16` values +/// A 256-bit SIMD vector with 16 elements of type `u16`. pub type u16x16 = Simd; -/// Vector of 32 `u16` values +/// A 512-bit SIMD vector with 32 elements of type `u16`. pub type u16x32 = Simd; -/// Vector of two `u32` values +/// A 64-bit SIMD vector with two elements of type `u32`. pub type u32x2 = Simd; -/// Vector of four `u32` values +/// A 128-bit SIMD vector with four elements of type `u32`. pub type u32x4 = Simd; -/// Vector of eight `u32` values +/// A 256-bit SIMD vector with eight elements of type `u32`. pub type u32x8 = Simd; -/// Vector of 16 `u32` values +/// A 512-bit SIMD vector with 16 elements of type `u32`. pub type u32x16 = Simd; -/// Vector of two `u64` values +/// A 128-bit SIMD vector with two elements of type `u64`. pub type u64x2 = Simd; -/// Vector of four `u64` values +/// A 256-bit SIMD vector with four elements of type `u64`. pub type u64x4 = Simd; -/// Vector of eight `u64` values +/// A 512-bit SIMD vector with eight elements of type `u64`. 
pub type u64x8 = Simd; -/// Vector of four `u8` values +/// A 32-bit SIMD vector with four elements of type `u8`. pub type u8x4 = Simd; -/// Vector of eight `u8` values +/// A 64-bit SIMD vector with eight elements of type `u8`. pub type u8x8 = Simd; -/// Vector of 16 `u8` values +/// A 128-bit SIMD vector with 16 elements of type `u8`. pub type u8x16 = Simd; -/// Vector of 32 `u8` values +/// A 256-bit SIMD vector with 32 elements of type `u8`. pub type u8x32 = Simd; -/// Vector of 64 `u8` values +/// A 512-bit SIMD vector with 64 elements of type `u8`. pub type u8x64 = Simd; diff --git a/crates/core_simd/tests/i16_ops.rs b/crates/core_simd/tests/i16_ops.rs index 171e5b472fa..f6c5d74fbbc 100644 --- a/crates/core_simd/tests/i16_ops.rs +++ b/crates/core_simd/tests/i16_ops.rs @@ -1,32 +1,5 @@ #![feature(portable_simd)] -use core_simd::i16x2; #[macro_use] mod ops_macros; impl_signed_tests! { i16 } - -#[test] -fn max_is_not_lexicographic() { - let a = i16x2::splat(10); - let b = i16x2::from_array([-4, 12]); - assert_eq!(a.max(b), i16x2::from_array([10, 12])); -} - -#[test] -fn min_is_not_lexicographic() { - let a = i16x2::splat(10); - let b = i16x2::from_array([12, -4]); - assert_eq!(a.min(b), i16x2::from_array([10, -4])); -} - -#[test] -fn clamp_is_not_lexicographic() { - let a = i16x2::splat(10); - let lo = i16x2::from_array([-12, -4]); - let up = i16x2::from_array([-4, 12]); - assert_eq!(a.clamp(lo, up), i16x2::from_array([-4, 10])); - - let x = i16x2::from_array([1, 10]); - let y = x.clamp(i16x2::splat(0), i16x2::splat(9)); - assert_eq!(y, i16x2::from_array([1, 9])); -} diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs index 3aec36ca7b7..673d0db93fe 100644 --- a/crates/core_simd/tests/masks.rs +++ b/crates/core_simd/tests/masks.rs @@ -80,6 +80,62 @@ macro_rules! 
test_mask_api { assert_eq!(bitmask, 0b1000001101001001); assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask); } + + #[test] + fn roundtrip_bitmask_conversion_short() { + use core_simd::ToBitMask; + + let values = [ + false, false, false, true, + ]; + let mask = core_simd::Mask::<$type, 4>::from_array(values); + let bitmask = mask.to_bitmask(); + assert_eq!(bitmask, 0b1000); + assert_eq!(core_simd::Mask::<$type, 4>::from_bitmask(bitmask), mask); + + let values = [true, false]; + let mask = core_simd::Mask::<$type, 2>::from_array(values); + let bitmask = mask.to_bitmask(); + assert_eq!(bitmask, 0b01); + assert_eq!(core_simd::Mask::<$type, 2>::from_bitmask(bitmask), mask); + } + + #[test] + fn cast() { + fn cast_impl() + where + core_simd::Mask<$type, 8>: Into>, + { + let values = [true, false, false, true, false, false, true, false]; + let mask = core_simd::Mask::<$type, 8>::from_array(values); + + let cast_mask = mask.cast::(); + assert_eq!(values, cast_mask.to_array()); + + let into_mask: core_simd::Mask = mask.into(); + assert_eq!(values, into_mask.to_array()); + } + + cast_impl::(); + cast_impl::(); + cast_impl::(); + cast_impl::(); + cast_impl::(); + } + + #[cfg(feature = "generic_const_exprs")] + #[test] + fn roundtrip_bitmask_array_conversion() { + use core_simd::ToBitMaskArray; + let values = [ + true, false, false, true, false, false, true, false, + true, true, false, false, false, false, false, true, + ]; + let mask = core_simd::Mask::<$type, 16>::from_array(values); + let bitmask = mask.to_bitmask_array(); + assert_eq!(bitmask, [0b01001001, 0b10000011]); + assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask_array(bitmask), mask); + } } } } diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs index 7c9b17673ef..f759394d075 100644 --- a/crates/core_simd/tests/ops_macros.rs +++ b/crates/core_simd/tests/ops_macros.rs @@ -172,6 +172,7 @@ macro_rules! impl_common_integer_tests { macro_rules! 
impl_signed_tests { { $scalar:tt } => { mod $scalar { + use core_simd::simd::SimdInt; type Vector = core_simd::Simd; type Scalar = $scalar; @@ -222,34 +223,37 @@ macro_rules! impl_signed_tests { assert_eq!(a % b, Vector::::splat(0)); } - fn min() { + fn simd_min() { + use core_simd::simd::SimdOrd; let a = Vector::::splat(Scalar::MIN); let b = Vector::::splat(0); - assert_eq!(a.min(b), a); + assert_eq!(a.simd_min(b), a); let a = Vector::::splat(Scalar::MAX); let b = Vector::::splat(0); - assert_eq!(a.min(b), b); + assert_eq!(a.simd_min(b), b); } - fn max() { + fn simd_max() { + use core_simd::simd::SimdOrd; let a = Vector::::splat(Scalar::MIN); let b = Vector::::splat(0); - assert_eq!(a.max(b), b); + assert_eq!(a.simd_max(b), b); let a = Vector::::splat(Scalar::MAX); let b = Vector::::splat(0); - assert_eq!(a.max(b), a); + assert_eq!(a.simd_max(b), a); } - fn clamp() { + fn simd_clamp() { + use core_simd::simd::SimdOrd; let min = Vector::::splat(Scalar::MIN); let max = Vector::::splat(Scalar::MAX); let zero = Vector::::splat(0); let one = Vector::::splat(1); let negone = Vector::::splat(-1); - assert_eq!(zero.clamp(min, max), zero); - assert_eq!(zero.clamp(min, one), zero); - assert_eq!(zero.clamp(one, max), one); - assert_eq!(zero.clamp(min, negone), negone); + assert_eq!(zero.simd_clamp(min, max), zero); + assert_eq!(zero.simd_clamp(min, one), zero); + assert_eq!(zero.simd_clamp(one, max), one); + assert_eq!(zero.simd_clamp(min, negone), negone); } } @@ -309,6 +313,7 @@ macro_rules! impl_signed_tests { macro_rules! impl_unsigned_tests { { $scalar:tt } => { mod $scalar { + use core_simd::simd::SimdUint; type Vector = core_simd::Simd; type Scalar = $scalar; @@ -343,6 +348,7 @@ macro_rules! impl_unsigned_tests { macro_rules! impl_float_tests { { $scalar:tt, $int_scalar:tt } => { mod $scalar { + use core_simd::SimdFloat; type Vector = core_simd::Simd; type Scalar = $scalar; @@ -458,10 +464,10 @@ macro_rules! 
impl_float_tests { ) } - fn min() { + fn simd_min() { // Regular conditions (both values aren't zero) test_helpers::test_binary_elementwise( - &Vector::::min, + &Vector::::simd_min, &Scalar::min, // Reject the case where both values are zero with different signs &|a, b| { @@ -477,14 +483,14 @@ macro_rules! impl_float_tests { // Special case where both values are zero let p_zero = Vector::::splat(0.); let n_zero = Vector::::splat(-0.); - assert!(p_zero.min(n_zero).to_array().iter().all(|x| *x == 0.)); - assert!(n_zero.min(p_zero).to_array().iter().all(|x| *x == 0.)); + assert!(p_zero.simd_min(n_zero).to_array().iter().all(|x| *x == 0.)); + assert!(n_zero.simd_min(p_zero).to_array().iter().all(|x| *x == 0.)); } - fn max() { + fn simd_max() { // Regular conditions (both values aren't zero) test_helpers::test_binary_elementwise( - &Vector::::max, + &Vector::::simd_max, &Scalar::max, // Reject the case where both values are zero with different signs &|a, b| { @@ -500,11 +506,11 @@ macro_rules! impl_float_tests { // Special case where both values are zero let p_zero = Vector::::splat(0.); let n_zero = Vector::::splat(-0.); - assert!(p_zero.max(n_zero).to_array().iter().all(|x| *x == 0.)); - assert!(n_zero.max(p_zero).to_array().iter().all(|x| *x == 0.)); + assert!(p_zero.simd_max(n_zero).to_array().iter().all(|x| *x == 0.)); + assert!(n_zero.simd_max(p_zero).to_array().iter().all(|x| *x == 0.)); } - fn clamp() { + fn simd_clamp() { test_helpers::test_3(&|value: [Scalar; LANES], mut min: [Scalar; LANES], mut max: [Scalar; LANES]| { for (min, max) in min.iter_mut().zip(max.iter_mut()) { if max < min { @@ -522,7 +528,7 @@ macro_rules! 
impl_float_tests { for i in 0..LANES { result_scalar[i] = value[i].clamp(min[i], max[i]); } - let result_vector = Vector::from_array(value).clamp(min.into(), max.into()).to_array(); + let result_vector = Vector::from_array(value).simd_clamp(min.into(), max.into()).to_array(); test_helpers::prop_assert_biteq!(result_scalar, result_vector); Ok(()) }) diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs index 7feb0320a16..484fd5bf47d 100644 --- a/crates/core_simd/tests/round.rs +++ b/crates/core_simd/tests/round.rs @@ -59,7 +59,7 @@ macro_rules! float_rounding_test { const MAX_REPRESENTABLE_VALUE: Scalar = (ALL_MANTISSA_BITS << (core::mem::size_of::() * 8 - ::MANTISSA_DIGITS as usize - 1)) as Scalar; - let mut runner = proptest::test_runner::TestRunner::default(); + let mut runner = test_helpers::make_runner(); runner.run( &test_helpers::array::UniformArrayStrategy::new(-MAX_REPRESENTABLE_VALUE..MAX_REPRESENTABLE_VALUE), |x| { diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs index 8bf7f5ed3d2..141bee18a9a 100644 --- a/crates/test_helpers/src/lib.rs +++ b/crates/test_helpers/src/lib.rs @@ -78,11 +78,11 @@ impl DefaultStrategy } #[cfg(not(miri))] -fn make_runner() -> proptest::test_runner::TestRunner { +pub fn make_runner() -> proptest::test_runner::TestRunner { Default::default() } #[cfg(miri)] -fn make_runner() -> proptest::test_runner::TestRunner { +pub fn make_runner() -> proptest::test_runner::TestRunner { // Only run a few tests on Miri proptest::test_runner::TestRunner::new(proptest::test_runner::Config::with_cases(4)) } From e2aec079eb4639d5c706a1aa6894d8f25edcaf37 Mon Sep 17 00:00:00 2001 From: Dezhi Wu Date: Thu, 18 Aug 2022 10:13:37 +0800 Subject: [PATCH 12/35] Fix a bunch of typo This PR will fix some typos detected by [typos]. I only picked the ones I was sure were spelling errors to fix, mostly in the comments. 
[typos]: https://github.com/crate-ci/typos --- crates/core_simd/src/masks/to_bitmask.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs index 65d3ce9be65..2235f016c71 100644 --- a/crates/core_simd/src/masks/to_bitmask.rs +++ b/crates/core_simd/src/masks/to_bitmask.rs @@ -70,7 +70,7 @@ impl_integer_intrinsic! { impl ToBitMask for Mask<_, 64> } -/// Returns the minimum numnber of bytes in a bitmask with `lanes` lanes. +/// Returns the minimum number of bytes in a bitmask with `lanes` lanes. #[cfg(feature = "generic_const_exprs")] pub const fn bitmask_len(lanes: usize) -> usize { (lanes + 7) / 8 From 1a6a069365fea16d28d71ee12a723d88027140cd Mon Sep 17 00:00:00 2001 From: Takayuki Maeda Date: Thu, 1 Sep 2022 18:43:35 +0900 Subject: [PATCH 13/35] separate the receiver from arguments in HIR under /clippy --- crates/std_float/src/lib.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs index 4bd4d4c05e3..4ac60b10c92 100644 --- a/crates/std_float/src/lib.rs +++ b/crates/std_float/src/lib.rs @@ -1,9 +1,5 @@ #![cfg_attr(feature = "as_crate", no_std)] // We are std! 
-#![cfg_attr( - feature = "as_crate", - feature(platform_intrinsics), - feature(portable_simd) -)] +#![cfg_attr(feature = "as_crate", feature(platform_intrinsics), feature(portable_simd))] #[cfg(not(feature = "as_crate"))] use core::simd; #[cfg(feature = "as_crate")] From 6afcb85219406bde837067a153cab813c64bab23 Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Wed, 2 Nov 2022 08:01:43 -0400 Subject: [PATCH 14/35] Fix rustdoc lints --- crates/core_simd/src/intrinsics.rs | 2 +- crates/core_simd/src/ops.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 962c83a78cb..704e6ed0159 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -103,7 +103,7 @@ extern "platform-intrinsic" { /// val: vector of values to select if a lane is masked /// ptr: vector of pointers to read from /// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val) - /// note, the LLVM intrinsic accepts a mask vector of + /// note, the LLVM intrinsic accepts a mask vector of `` /// FIXME: review this if/when we fix up our mask story in general? pub(crate) fn simd_gather(val: T, ptr: U, mask: V) -> T; /// llvm.masked.scatter diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs index 5a077a469d8..fc1e0bc426d 100644 --- a/crates/core_simd/src/ops.rs +++ b/crates/core_simd/src/ops.rs @@ -40,7 +40,7 @@ macro_rules! unsafe_base { /// SAFETY: This macro should not be used for anything except Shl or Shr, and passed the appropriate shift intrinsic. 
/// It handles performing a bitand in addition to calling the shift operator, so that the result -/// is well-defined: LLVM can return a poison value if you shl, lshr, or ashr if rhs >= ::BITS +/// is well-defined: LLVM can return a poison value if you shl, lshr, or ashr if `rhs >= ::BITS` /// At worst, this will maybe add another instruction and cycle, /// at best, it may open up more optimization opportunities, /// or simply be elided entirely, especially for SIMD ISAs which default to this. From 3cb40e51dc9092b717a764c4c230d3649afcf1d9 Mon Sep 17 00:00:00 2001 From: Nikolai Vazquez Date: Sat, 21 Jan 2023 22:00:25 -0500 Subject: [PATCH 15/35] Remove unnecessary `&format!` These were likely from before the `PartialEq` impl for `&String`. --- crates/core_simd/examples/spectral_norm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/examples/spectral_norm.rs b/crates/core_simd/examples/spectral_norm.rs index 012182e090b..d576bd0ccee 100644 --- a/crates/core_simd/examples/spectral_norm.rs +++ b/crates/core_simd/examples/spectral_norm.rs @@ -69,7 +69,7 @@ fn dot(x: &[f64], y: &[f64]) -> f64 { #[cfg(test)] #[test] fn test() { - assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991"); + assert_eq!(format!("{:.9}", spectral_norm(100)), "1.274219991"); } fn main() { From ddcb68a94fe1e3da6247f2cb8f944a0494a80552 Mon Sep 17 00:00:00 2001 From: est31 Date: Fri, 3 Mar 2023 02:35:10 +0100 Subject: [PATCH 16/35] Match unmatched backticks in library/ --- crates/core_simd/src/vector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 78f56402eba..d52d1ac4d3a 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -28,7 +28,7 @@ use crate::simd::{ /// let zm_add = a0.zip(a1).map(|(lhs, rhs)| lhs + rhs); /// let zm_mul = a0.zip(a1).map(|(lhs, rhs)| lhs * rhs); /// -/// // `Simd` implements `From<[T; N]> +/// // `Simd` 
implements `From<[T; N]>` /// let (v0, v1) = (Simd::from(a0), Simd::from(a1)); /// // Which means arrays implement `Into>`. /// assert_eq!(v0 + v1, zm_add.into()); From 280657066a144645a4d42a5be84b25329f456a38 Mon Sep 17 00:00:00 2001 From: KaDiWa Date: Tue, 9 Aug 2022 02:14:43 +0200 Subject: [PATCH 17/35] remove some unneeded imports --- crates/core_simd/src/masks/full_masks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs index adf0fcbeae2..b5ba198e504 100644 --- a/crates/core_simd/src/masks/full_masks.rs +++ b/crates/core_simd/src/masks/full_masks.rs @@ -257,7 +257,7 @@ where } } -impl core::convert::From> for Simd +impl From> for Simd where T: MaskElement, LaneCount: SupportedLaneCount, From a2cdcd53bffd1c21b5093b1001d0fbbaf2bbac2f Mon Sep 17 00:00:00 2001 From: Michael Goulet Date: Fri, 28 Apr 2023 17:36:49 +0000 Subject: [PATCH 18/35] Make sure that signatures aren't accidental refinements --- crates/core_simd/src/ops/deref.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs index 9883a74c92d..302bf148bd3 100644 --- a/crates/core_simd/src/ops/deref.rs +++ b/crates/core_simd/src/ops/deref.rs @@ -71,7 +71,7 @@ macro_rules! 
deref_ops { #[inline] #[must_use = "operator returns a new vector without mutating the inputs"] - fn $call(self, rhs: &$simd) -> Self::Output { + fn $call(self, rhs: &'rhs $simd) -> Self::Output { (*self).$call(*rhs) } } From a978408be9ec1800bee8eebaba736510e6d40255 Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Thu, 11 May 2023 12:13:00 -0700 Subject: [PATCH 19/35] Sync portable-simd to 2023 May 10 Sync up to rust-lang/portable-simd@852762563aa890286eda2f668b8af30f8aa84216 --- .github/workflows/ci.yml | 4 + README.md | 34 +- crates/core_simd/Cargo.toml | 9 +- crates/core_simd/examples/README.md | 13 + crates/core_simd/examples/dot_product.rs | 169 +++++ crates/core_simd/src/alias.rs | 227 ++++++ crates/core_simd/src/cast.rs | 55 ++ crates/core_simd/src/elements.rs | 4 + crates/core_simd/src/elements/const_ptr.rs | 141 ++++ crates/core_simd/src/elements/mut_ptr.rs | 136 ++++ crates/core_simd/src/eq.rs | 38 +- crates/core_simd/src/fmt.rs | 50 +- crates/core_simd/src/intrinsics.rs | 16 +- crates/core_simd/src/lane_count.rs | 36 +- crates/core_simd/src/lib.rs | 8 +- crates/core_simd/src/masks.rs | 67 +- crates/core_simd/src/masks/bitmask.rs | 4 + crates/core_simd/src/masks/full_masks.rs | 4 + crates/core_simd/src/masks/to_bitmask.rs | 4 + crates/core_simd/src/mod.rs | 6 + crates/core_simd/src/ord.rs | 102 ++- crates/core_simd/src/swizzle.rs | 72 +- crates/core_simd/src/swizzle_dyn.rs | 157 +++++ crates/core_simd/src/vector.rs | 665 +++++++++++++----- crates/core_simd/src/vector/float.rs | 24 - crates/core_simd/src/vector/int.rs | 63 -- crates/core_simd/src/vector/ptr.rs | 51 -- crates/core_simd/src/vector/uint.rs | 63 -- crates/core_simd/tests/autoderef.rs | 2 +- .../tests/mask_ops_impl/mask_macros.rs | 2 +- crates/core_simd/tests/masks.rs | 59 +- crates/core_simd/tests/ops_macros.rs | 14 +- crates/core_simd/tests/pointers.rs | 111 +++ crates/core_simd/tests/round.rs | 2 +- crates/core_simd/tests/swizzle.rs | 16 +- crates/core_simd/tests/swizzle_dyn.rs | 74 ++ 
crates/core_simd/tests/to_bytes.rs | 2 +- crates/core_simd/tests/try_from_slice.rs | 25 + crates/test_helpers/Cargo.toml | 3 + crates/test_helpers/src/array.rs | 2 + crates/test_helpers/src/biteq.rs | 20 + crates/test_helpers/src/lib.rs | 338 ++++++--- 42 files changed, 2173 insertions(+), 719 deletions(-) create mode 100644 crates/core_simd/examples/README.md create mode 100644 crates/core_simd/examples/dot_product.rs create mode 100644 crates/core_simd/src/alias.rs create mode 100644 crates/core_simd/src/cast.rs create mode 100644 crates/core_simd/src/elements/const_ptr.rs create mode 100644 crates/core_simd/src/elements/mut_ptr.rs create mode 100644 crates/core_simd/src/swizzle_dyn.rs delete mode 100644 crates/core_simd/src/vector/float.rs delete mode 100644 crates/core_simd/src/vector/int.rs delete mode 100644 crates/core_simd/src/vector/ptr.rs delete mode 100644 crates/core_simd/src/vector/uint.rs create mode 100644 crates/core_simd/tests/pointers.rs create mode 100644 crates/core_simd/tests/swizzle_dyn.rs create mode 100644 crates/core_simd/tests/try_from_slice.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d50dfa1be4c..acd47a3da72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -241,6 +241,10 @@ jobs: - "--features std" - "--features generic_const_exprs" - "--features std --features generic_const_exprs" + - "--features all_lane_counts" + - "--features all_lane_counts --features std" + - "--features all_lane_counts --features generic_const_exprs" + - "--features all_lane_counts --features std --features generic_const_exprs" steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index db0af2da606..e8ac600debe 100644 --- a/README.md +++ b/README.md @@ -24,19 +24,10 @@ or by setting up `rustup default nightly` or else with `cargo +nightly {build,te ```bash cargo new hellosimd ``` -to create a new crate. 
Edit `hellosimd/Cargo.toml` to be -```toml -[package] -name = "hellosimd" -version = "0.1.0" -edition = "2018" -[dependencies] -core_simd = { git = "https://github.com/rust-lang/portable-simd" } -``` - -and finally write this in `src/main.rs`: +to create a new crate. Finally write this in `src/main.rs`: ```rust -use core_simd::*; +#![feature(portable_simd)] +use std::simd::f32x4; fn main() { let a = f32x4::splat(10.0); let b = f32x4::from_array([1.0, 2.0, 3.0, 4.0]); @@ -44,24 +35,23 @@ fn main() { } ``` -Explanation: We import all the bindings from the crate with the first line. Then, we construct our SIMD vectors with methods like `splat` or `from_array`. Finally, we can use operators on them like `+` and the appropriate SIMD instructions will be carried out. When we run `cargo run` you should get `[11.0, 12.0, 13.0, 14.0]`. +Explanation: We construct our SIMD vectors with methods like `splat` or `from_array`. Next, we can use operators like `+` on them, and the appropriate SIMD instructions will be carried out. When we run `cargo run` you should get `[11.0, 12.0, 13.0, 14.0]`. -## Code Organization +## Supported vectors -Currently the crate is organized so that each element type is a file, and then the 64-bit, 128-bit, 256-bit, and 512-bit vectors using those types are contained in said file. - -All types are then exported as a single, flat module. +Currently, vectors may have up to 64 elements, but aliases are provided only up to 512-bit vectors. Depending on the size of the primitive type, the number of lanes the vector will have varies. For example, 128-bit vectors have four `f32` lanes and two `f64` lanes. 
The supported element types are as follows: * **Floating Point:** `f32`, `f64` -* **Signed Integers:** `i8`, `i16`, `i32`, `i64`, `i128`, `isize` -* **Unsigned Integers:** `u8`, `u16`, `u32`, `u64`, `u128`, `usize` -* **Masks:** `mask8`, `mask16`, `mask32`, `mask64`, `mask128`, `masksize` +* **Signed Integers:** `i8`, `i16`, `i32`, `i64`, `isize` (`i128` excluded) +* **Unsigned Integers:** `u8`, `u16`, `u32`, `u64`, `usize` (`u128` excluded) +* **Pointers:** `*const T` and `*mut T` (zero-sized metadata only) +* **Masks:** 8-bit, 16-bit, 32-bit, 64-bit, and `usize`-sized masks -Floating point, signed integers, and unsigned integers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to. -The `mask` types are "truthy" values, but they use the number of bits in their name instead of just 1 bit like a normal `bool` uses. +Floating point, signed integers, unsigned integers, and pointers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to. +The mask types have elements that are "truthy" values, like `bool`, but have an unspecified layout because different architectures prefer different layouts for mask types. 
[simd-guide]: ./beginners-guide.md [zulip-project-portable-simd]: https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml index 8a29cf15696..d1a3a515a7e 100644 --- a/crates/core_simd/Cargo.toml +++ b/crates/core_simd/Cargo.toml @@ -13,12 +13,11 @@ default = ["as_crate"] as_crate = [] std = [] generic_const_exprs = [] +all_lane_counts = [] -[target.'cfg(target_arch = "wasm32")'.dev-dependencies.wasm-bindgen] -version = "0.2" - -[dev-dependencies.wasm-bindgen-test] -version = "0.3" +[target.'cfg(target_arch = "wasm32")'.dev-dependencies] +wasm-bindgen = "0.2" +wasm-bindgen-test = "0.3" [dev-dependencies.proptest] version = "0.10" diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md new file mode 100644 index 00000000000..82747f1b5a6 --- /dev/null +++ b/crates/core_simd/examples/README.md @@ -0,0 +1,13 @@ +### `stdsimd` examples + +This crate is a port of example uses of `stdsimd`, mostly taken from the `packed_simd` crate. + +The examples contain, as in the case of `dot_product.rs`, multiple ways of solving the problem, in order to show idiomatic uses of SIMD and iteration of performance designs. + +Run the tests with the command + +``` +cargo run --example dot_product +``` + +and verify the code for `dot_product.rs` on your machine. 
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs new file mode 100644 index 00000000000..391f08f55a0 --- /dev/null +++ b/crates/core_simd/examples/dot_product.rs @@ -0,0 +1,169 @@ +// Code taken from the `packed_simd` crate +// Run this code with `cargo test --example dot_product` +//use std::iter::zip; + +#![feature(array_chunks)] +#![feature(slice_as_chunks)] +// Add these imports to use the stdsimd library +#![feature(portable_simd)] +use core_simd::simd::*; + +// This is your barebones dot product implementation: +// Take 2 vectors, multiply them element wise and *then* +// go along the resulting array and add up the result. +// In the next example we will see if there +// is any difference to adding and multiplying in tandem. +pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + + a.iter().zip(b.iter()).map(|(a, b)| a * b).sum() +} + +// When dealing with SIMD, it is very important to think about the amount +// of data movement and when it happens. We're going over simple computation examples here, and yet +// it is not trivial to understand what may or may not contribute to performance +// changes. Eventually, you will need tools to inspect the generated assembly and confirm your +// hypothesis and benchmarks - we will mention them later on. +// With the use of `fold`, we're doing a multiplication, +// and then adding it to the sum, one element from both vectors at a time. +pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + a.iter() + .zip(b.iter()) + .fold(0.0, |a, zipped| a + zipped.0 * zipped.1) +} + +// We now move on to the SIMD implementations: notice the following constructs: +// `array_chunks::<4>`: mapping this over the vector will let use construct SIMD vectors +// `f32x4::from_array`: construct the SIMD vector from a slice +// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them. 
+// This approach essentially uses SIMD to produce a vector of length N/4 of all the products, +// and then add those with `sum()`. This is suboptimal. +// TODO: ASCII diagrams +pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + // TODO handle remainder when a.len() % 4 != 0 + a.array_chunks::<4>() + .map(|&a| f32x4::from_array(a)) + .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) + .map(|(a, b)| (a * b).reduce_sum()) + .sum() +} + +// There's some simple ways to improve the previous code: +// 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into +// So that there is only one `sum()` reduction when the last `f32x4` has been processed +// 2. Exploit Fused Multiply Add so that the multiplication, addition and sinking into the reduciton +// happen in the same step. +// If the arrays are large, minimizing the data shuffling will lead to great perf. +// If the arrays are small, handling the remainder elements when the length isn't a multiple of 4 +// Can become a problem. +pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + // TODO handle remainder when a.len() % 4 != 0 + a.array_chunks::<4>() + .map(|&a| f32x4::from_array(a)) + .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) + .fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1) + .reduce_sum() +} + +// A lot of knowledgeable use of SIMD comes from knowing specific instructions that are +// available - let's try to use the `mul_add` instruction, which is the fused-multiply-add we were looking for. 
+use std_float::StdFloat; +pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + // TODO handle remainder when a.len() % 4 != 0 + let mut res = f32x4::splat(0.0); + a.array_chunks::<4>() + .map(|&a| f32x4::from_array(a)) + .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) + .for_each(|(a, b)| { + res = a.mul_add(b, res); + }); + res.reduce_sum() +} + +// Finally, we will write the same operation but handling the loop remainder. +const LANES: usize = 4; +pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + + let (a_extra, a_chunks) = a.as_rchunks(); + let (b_extra, b_chunks) = b.as_rchunks(); + + // These are always true, but for emphasis: + assert_eq!(a_chunks.len(), b_chunks.len()); + assert_eq!(a_extra.len(), b_extra.len()); + + let mut sums = [0.0; LANES]; + for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) { + *d = x * y; + } + + let mut sums = f32x4::from_array(sums); + std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| { + sums += f32x4::from_array(*x) * f32x4::from_array(*y); + }); + + sums.reduce_sum() +} + +// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop. +// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the +// next example. +pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 { + let mut sum = a + .array_chunks::<4>() + .map(|&a| f32x4::from_array(a)) + .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) + .map(|(a, b)| a * b) + .fold(f32x4::splat(0.0), std::ops::Add::add) + .reduce_sum(); + let remain = a.len() - (a.len() % 4); + sum += a[remain..] + .iter() + .zip(&b[remain..]) + .map(|(a, b)| a * b) + .sum::(); + sum +} + +// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that. 
+// Notice the the use of `mul_add`, which can do a multiply and an add operation ber iteration. +pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 { + a.array_chunks::<4>() + .map(|&a| f32x4::from_array(a)) + .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b))) + .fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(b, acc)) + .reduce_sum() +} + +fn main() { + // Empty main to make cargo happy +} + +#[cfg(test)] +mod tests { + #[test] + fn smoke_test() { + use super::*; + let a: Vec = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let b: Vec = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0]; + let x: Vec = [0.5; 1003].to_vec(); + let y: Vec = [2.0; 1003].to_vec(); + + // Basic check + assert_eq!(0.0, dot_prod_scalar_0(&a, &b)); + assert_eq!(0.0, dot_prod_scalar_1(&a, &b)); + assert_eq!(0.0, dot_prod_simd_0(&a, &b)); + assert_eq!(0.0, dot_prod_simd_1(&a, &b)); + assert_eq!(0.0, dot_prod_simd_2(&a, &b)); + assert_eq!(0.0, dot_prod_simd_3(&a, &b)); + assert_eq!(0.0, dot_prod_simd_4(&a, &b)); + assert_eq!(0.0, dot_prod_simd_5(&a, &b)); + + // We can handle vectors that are non-multiples of 4 + assert_eq!(1003.0, dot_prod_simd_3(&x, &y)); + } +} diff --git a/crates/core_simd/src/alias.rs b/crates/core_simd/src/alias.rs new file mode 100644 index 00000000000..23f121c4619 --- /dev/null +++ b/crates/core_simd/src/alias.rs @@ -0,0 +1,227 @@ +macro_rules! number { + { 1 } => { "one" }; + { 2 } => { "two" }; + { 4 } => { "four" }; + { 8 } => { "eight" }; + { $x:literal } => { stringify!($x) }; +} + +macro_rules! plural { + { 1 } => { "" }; + { $x:literal } => { "s" }; +} + +macro_rules! alias { + { + $( + $element_ty:ty = { + $($alias:ident $num_elements:tt)* + } + )* + } => { + $( + $( + #[doc = concat!("A SIMD vector with ", number!($num_elements), " element", plural!($num_elements), " of type [`", stringify!($element_ty), "`].")] + #[allow(non_camel_case_types)] + pub type $alias = $crate::simd::Simd<$element_ty, $num_elements>; + )* + )* + } +} + +macro_rules! 
mask_alias { + { + $( + $element_ty:ty : $size:literal = { + $($alias:ident $num_elements:tt)* + } + )* + } => { + $( + $( + #[doc = concat!("A SIMD mask with ", number!($num_elements), " element", plural!($num_elements), " for vectors with ", $size, " element types.")] + /// + #[doc = concat!( + "The layout of this type is unspecified, and may change between platforms and/or Rust versions, and code should not assume that it is equivalent to `[", + stringify!($element_ty), "; ", $num_elements, "]`." + )] + #[allow(non_camel_case_types)] + pub type $alias = $crate::simd::Mask<$element_ty, $num_elements>; + )* + )* + } +} + +alias! { + i8 = { + i8x1 1 + i8x2 2 + i8x4 4 + i8x8 8 + i8x16 16 + i8x32 32 + i8x64 64 + } + + i16 = { + i16x1 1 + i16x2 2 + i16x4 4 + i16x8 8 + i16x16 16 + i16x32 32 + i16x64 64 + } + + i32 = { + i32x1 1 + i32x2 2 + i32x4 4 + i32x8 8 + i32x16 16 + i32x32 32 + i32x64 64 + } + + i64 = { + i64x1 1 + i64x2 2 + i64x4 4 + i64x8 8 + i64x16 16 + i64x32 32 + i64x64 64 + } + + isize = { + isizex1 1 + isizex2 2 + isizex4 4 + isizex8 8 + isizex16 16 + isizex32 32 + isizex64 64 + } + + u8 = { + u8x1 1 + u8x2 2 + u8x4 4 + u8x8 8 + u8x16 16 + u8x32 32 + u8x64 64 + } + + u16 = { + u16x1 1 + u16x2 2 + u16x4 4 + u16x8 8 + u16x16 16 + u16x32 32 + u16x64 64 + } + + u32 = { + u32x1 1 + u32x2 2 + u32x4 4 + u32x8 8 + u32x16 16 + u32x32 32 + u32x64 64 + } + + u64 = { + u64x1 1 + u64x2 2 + u64x4 4 + u64x8 8 + u64x16 16 + u64x32 32 + u64x64 64 + } + + usize = { + usizex1 1 + usizex2 2 + usizex4 4 + usizex8 8 + usizex16 16 + usizex32 32 + usizex64 64 + } + + f32 = { + f32x1 1 + f32x2 2 + f32x4 4 + f32x8 8 + f32x16 16 + f32x32 32 + f32x64 64 + } + + f64 = { + f64x1 1 + f64x2 2 + f64x4 4 + f64x8 8 + f64x16 16 + f64x32 32 + f64x64 64 + } +} + +mask_alias! 
{ + i8 : "8-bit" = { + mask8x1 1 + mask8x2 2 + mask8x4 4 + mask8x8 8 + mask8x16 16 + mask8x32 32 + mask8x64 64 + } + + i16 : "16-bit" = { + mask16x1 1 + mask16x2 2 + mask16x4 4 + mask16x8 8 + mask16x16 16 + mask16x32 32 + mask16x64 64 + } + + i32 : "32-bit" = { + mask32x1 1 + mask32x2 2 + mask32x4 4 + mask32x8 8 + mask32x16 16 + mask32x32 32 + mask32x64 64 + } + + i64 : "64-bit" = { + mask64x1 1 + mask64x2 2 + mask64x4 4 + mask64x8 8 + mask64x16 16 + mask64x32 32 + mask64x64 64 + } + + isize : "pointer-sized" = { + masksizex1 1 + masksizex2 2 + masksizex4 4 + masksizex8 8 + masksizex16 16 + masksizex32 32 + masksizex64 64 + } +} diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs new file mode 100644 index 00000000000..65a3f845ffc --- /dev/null +++ b/crates/core_simd/src/cast.rs @@ -0,0 +1,55 @@ +use crate::simd::SimdElement; + +/// Supporting trait for `Simd::cast`. Typically doesn't need to be used directly. +/// +/// # Safety +/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast` or +/// `simd_as` intrinsics. 
+pub unsafe trait SimdCast: SimdElement {} + +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for i8 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for i16 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for i32 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for i64 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for isize {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for u8 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for u16 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for u32 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for u64 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for usize {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for f32 {} +// Safety: primitive number types can be cast to other primitive number types +unsafe impl SimdCast for f64 {} + +/// Supporting trait for `Simd::cast_ptr`. Typically doesn't need to be used directly. +/// +/// # Safety +/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast_ptr` +/// intrinsic. 
+pub unsafe trait SimdCastPtr {} + +// Safety: pointers can be cast to other pointer types +unsafe impl SimdCastPtr for *const U +where + U: core::ptr::Pointee, + T: core::ptr::Pointee, +{ +} +// Safety: pointers can be cast to other pointer types +unsafe impl SimdCastPtr for *mut U +where + U: core::ptr::Pointee, + T: core::ptr::Pointee, +{ +} diff --git a/crates/core_simd/src/elements.rs b/crates/core_simd/src/elements.rs index 701eb66b248..dc7f52a4d57 100644 --- a/crates/core_simd/src/elements.rs +++ b/crates/core_simd/src/elements.rs @@ -1,11 +1,15 @@ +mod const_ptr; mod float; mod int; +mod mut_ptr; mod uint; mod sealed { pub trait Sealed {} } +pub use const_ptr::*; pub use float::*; pub use int::*; +pub use mut_ptr::*; pub use uint::*; diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs new file mode 100644 index 00000000000..0ef9802b5e2 --- /dev/null +++ b/crates/core_simd/src/elements/const_ptr.rs @@ -0,0 +1,141 @@ +use super::sealed::Sealed; +use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount}; + +/// Operations on SIMD vectors of constant pointers. +pub trait SimdConstPtr: Copy + Sealed { + /// Vector of `usize` with the same number of lanes. + type Usize; + + /// Vector of `isize` with the same number of lanes. + type Isize; + + /// Vector of mutable pointers to the same type. + type MutPtr; + + /// Mask type used for manipulating this SIMD vector type. + type Mask; + + /// Returns `true` for each lane that is null. + fn is_null(self) -> Self::Mask; + + /// Changes constness without changing the type. + /// + /// Equivalent to calling [`pointer::cast_mut`] on each lane. + fn cast_mut(self) -> Self::MutPtr; + + /// Gets the "address" portion of the pointer. + /// + /// This method discards pointer semantic metadata, so the result cannot be + /// directly cast into a valid pointer. 
+ /// + /// This method semantically discards *provenance* and + /// *address-space* information. To properly restore that information, use [`Self::with_addr`]. + /// + /// Equivalent to calling [`pointer::addr`] on each lane. + fn addr(self) -> Self::Usize; + + /// Creates a new pointer with the given address. + /// + /// This performs the same operation as a cast, but copies the *address-space* and + /// *provenance* of `self` to the new pointer. + /// + /// Equivalent to calling [`pointer::with_addr`] on each lane. + fn with_addr(self, addr: Self::Usize) -> Self; + + /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use + /// in [`Self::from_exposed_addr`]. + fn expose_addr(self) -> Self::Usize; + + /// Convert an address back to a pointer, picking up a previously "exposed" provenance. + /// + /// Equivalent to calling [`core::ptr::from_exposed_addr`] on each lane. + fn from_exposed_addr(addr: Self::Usize) -> Self; + + /// Calculates the offset from a pointer using wrapping arithmetic. + /// + /// Equivalent to calling [`pointer::wrapping_offset`] on each lane. + fn wrapping_offset(self, offset: Self::Isize) -> Self; + + /// Calculates the offset from a pointer using wrapping arithmetic. + /// + /// Equivalent to calling [`pointer::wrapping_add`] on each lane. + fn wrapping_add(self, count: Self::Usize) -> Self; + + /// Calculates the offset from a pointer using wrapping arithmetic. + /// + /// Equivalent to calling [`pointer::wrapping_sub`] on each lane. 
+ fn wrapping_sub(self, count: Self::Usize) -> Self; +} + +impl Sealed for Simd<*const T, LANES> where + LaneCount: SupportedLaneCount +{ +} + +impl SimdConstPtr for Simd<*const T, LANES> +where + LaneCount: SupportedLaneCount, +{ + type Usize = Simd; + type Isize = Simd; + type MutPtr = Simd<*mut T, LANES>; + type Mask = Mask; + + #[inline] + fn is_null(self) -> Self::Mask { + Simd::splat(core::ptr::null()).simd_eq(self) + } + + #[inline] + fn cast_mut(self) -> Self::MutPtr { + self.cast_ptr() + } + + #[inline] + fn addr(self) -> Self::Usize { + // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic. + // SAFETY: Pointer-to-integer transmutes are valid (if you are okay with losing the + // provenance). + unsafe { core::mem::transmute_copy(&self) } + } + + #[inline] + fn with_addr(self, addr: Self::Usize) -> Self { + // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic. + // + // In the mean-time, this operation is defined to be "as if" it was + // a wrapping_offset, so we can emulate it as such. This should properly + // restore pointer provenance even under today's compiler. 
+ self.cast_ptr::<*const u8>() + .wrapping_offset(addr.cast::() - self.addr().cast::()) + .cast_ptr() + } + + #[inline] + fn expose_addr(self) -> Self::Usize { + // Safety: `self` is a pointer vector + unsafe { intrinsics::simd_expose_addr(self) } + } + + #[inline] + fn from_exposed_addr(addr: Self::Usize) -> Self { + // Safety: `self` is a pointer vector + unsafe { intrinsics::simd_from_exposed_addr(addr) } + } + + #[inline] + fn wrapping_offset(self, count: Self::Isize) -> Self { + // Safety: simd_arith_offset takes a vector of pointers and a vector of offsets + unsafe { intrinsics::simd_arith_offset(self, count) } + } + + #[inline] + fn wrapping_add(self, count: Self::Usize) -> Self { + self.wrapping_offset(count.cast()) + } + + #[inline] + fn wrapping_sub(self, count: Self::Usize) -> Self { + self.wrapping_offset(-count.cast::()) + } +} diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs new file mode 100644 index 00000000000..d87986b4a09 --- /dev/null +++ b/crates/core_simd/src/elements/mut_ptr.rs @@ -0,0 +1,136 @@ +use super::sealed::Sealed; +use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount}; + +/// Operations on SIMD vectors of mutable pointers. +pub trait SimdMutPtr: Copy + Sealed { + /// Vector of `usize` with the same number of lanes. + type Usize; + + /// Vector of `isize` with the same number of lanes. + type Isize; + + /// Vector of constant pointers to the same type. + type ConstPtr; + + /// Mask type used for manipulating this SIMD vector type. + type Mask; + + /// Returns `true` for each lane that is null. + fn is_null(self) -> Self::Mask; + + /// Changes constness without changing the type. + /// + /// Equivalent to calling [`pointer::cast_const`] on each lane. + fn cast_const(self) -> Self::ConstPtr; + + /// Gets the "address" portion of the pointer. 
+ /// + /// This method discards pointer semantic metadata, so the result cannot be + /// directly cast into a valid pointer. + /// + /// Equivalent to calling [`pointer::addr`] on each lane. + fn addr(self) -> Self::Usize; + + /// Creates a new pointer with the given address. + /// + /// This performs the same operation as a cast, but copies the *address-space* and + /// *provenance* of `self` to the new pointer. + /// + /// Equivalent to calling [`pointer::with_addr`] on each lane. + fn with_addr(self, addr: Self::Usize) -> Self; + + /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use + /// in [`Self::from_exposed_addr`]. + fn expose_addr(self) -> Self::Usize; + + /// Convert an address back to a pointer, picking up a previously "exposed" provenance. + /// + /// Equivalent to calling [`core::ptr::from_exposed_addr_mut`] on each lane. + fn from_exposed_addr(addr: Self::Usize) -> Self; + + /// Calculates the offset from a pointer using wrapping arithmetic. + /// + /// Equivalent to calling [`pointer::wrapping_offset`] on each lane. + fn wrapping_offset(self, offset: Self::Isize) -> Self; + + /// Calculates the offset from a pointer using wrapping arithmetic. + /// + /// Equivalent to calling [`pointer::wrapping_add`] on each lane. + fn wrapping_add(self, count: Self::Usize) -> Self; + + /// Calculates the offset from a pointer using wrapping arithmetic. + /// + /// Equivalent to calling [`pointer::wrapping_sub`] on each lane. 
+ fn wrapping_sub(self, count: Self::Usize) -> Self; +} + +impl Sealed for Simd<*mut T, LANES> where LaneCount: SupportedLaneCount +{} + +impl SimdMutPtr for Simd<*mut T, LANES> +where + LaneCount: SupportedLaneCount, +{ + type Usize = Simd; + type Isize = Simd; + type ConstPtr = Simd<*const T, LANES>; + type Mask = Mask; + + #[inline] + fn is_null(self) -> Self::Mask { + Simd::splat(core::ptr::null_mut()).simd_eq(self) + } + + #[inline] + fn cast_const(self) -> Self::ConstPtr { + self.cast_ptr() + } + + #[inline] + fn addr(self) -> Self::Usize { + // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic. + // SAFETY: Pointer-to-integer transmutes are valid (if you are okay with losing the + // provenance). + unsafe { core::mem::transmute_copy(&self) } + } + + #[inline] + fn with_addr(self, addr: Self::Usize) -> Self { + // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic. + // + // In the mean-time, this operation is defined to be "as if" it was + // a wrapping_offset, so we can emulate it as such. This should properly + // restore pointer provenance even under today's compiler. 
+ self.cast_ptr::<*mut u8>() + .wrapping_offset(addr.cast::() - self.addr().cast::()) + .cast_ptr() + } + + #[inline] + fn expose_addr(self) -> Self::Usize { + // Safety: `self` is a pointer vector + unsafe { intrinsics::simd_expose_addr(self) } + } + + #[inline] + fn from_exposed_addr(addr: Self::Usize) -> Self { + // Safety: `self` is a pointer vector + unsafe { intrinsics::simd_from_exposed_addr(addr) } + } + + #[inline] + fn wrapping_offset(self, count: Self::Isize) -> Self { + // Safety: simd_arith_offset takes a vector of pointers and a vector of offsets + unsafe { intrinsics::simd_arith_offset(self, count) } + } + + #[inline] + fn wrapping_add(self, count: Self::Usize) -> Self { + self.wrapping_offset(count.cast()) + } + + #[inline] + fn wrapping_sub(self, count: Self::Usize) -> Self { + self.wrapping_offset(-count.cast::()) + } +} diff --git a/crates/core_simd/src/eq.rs b/crates/core_simd/src/eq.rs index c7111f720a8..80763c07272 100644 --- a/crates/core_simd/src/eq.rs +++ b/crates/core_simd/src/eq.rs @@ -1,4 +1,6 @@ -use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdElement, SupportedLaneCount}; +use crate::simd::{ + intrinsics, LaneCount, Mask, Simd, SimdConstPtr, SimdElement, SimdMutPtr, SupportedLaneCount, +}; /// Parallel `PartialEq`. pub trait SimdPartialEq { @@ -71,3 +73,37 @@ macro_rules! impl_mask { } impl_mask! 
{ i8, i16, i32, i64, isize } + +impl SimdPartialEq for Simd<*const T, LANES> +where + LaneCount: SupportedLaneCount, +{ + type Mask = Mask; + + #[inline] + fn simd_eq(self, other: Self) -> Self::Mask { + self.addr().simd_eq(other.addr()) + } + + #[inline] + fn simd_ne(self, other: Self) -> Self::Mask { + self.addr().simd_ne(other.addr()) + } +} + +impl SimdPartialEq for Simd<*mut T, LANES> +where + LaneCount: SupportedLaneCount, +{ + type Mask = Mask; + + #[inline] + fn simd_eq(self, other: Self) -> Self::Mask { + self.addr().simd_eq(other.addr()) + } + + #[inline] + fn simd_ne(self, other: Self) -> Self::Mask { + self.addr().simd_ne(other.addr()) + } +} diff --git a/crates/core_simd/src/fmt.rs b/crates/core_simd/src/fmt.rs index dbd9839c4bf..b7317969cbb 100644 --- a/crates/core_simd/src/fmt.rs +++ b/crates/core_simd/src/fmt.rs @@ -1,39 +1,21 @@ use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; use core::fmt; -macro_rules! impl_fmt_trait { - { $($trait:ident,)* } => { - $( - impl fmt::$trait for Simd - where - LaneCount: SupportedLaneCount, - T: SimdElement + fmt::$trait, - { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - #[repr(transparent)] - struct Wrapper<'a, T: fmt::$trait>(&'a T); - - impl fmt::Debug for Wrapper<'_, T> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } - } - - f.debug_list() - .entries(self.as_array().iter().map(|x| Wrapper(x))) - .finish() - } - } - )* +impl fmt::Debug for Simd +where + LaneCount: SupportedLaneCount, + T: SimdElement + fmt::Debug, +{ + /// A `Simd` has a debug format like the one for `[T]`: + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd::Simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd::Simd; + /// let floats = Simd::::splat(-1.0); + /// assert_eq!(format!("{:?}", [-1.0; 4]), format!("{:?}", floats)); + /// ``` + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + <[T] 
as fmt::Debug>::fmt(self.as_array(), f) } } - -impl_fmt_trait! { - Debug, - Binary, - LowerExp, - UpperExp, - Octal, - LowerHex, - UpperHex, -} diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 704e6ed0159..dd6698e2ba5 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -61,9 +61,6 @@ extern "platform-intrinsic" { /// xor pub(crate) fn simd_xor(x: T, y: T) -> T; - /// getelementptr (without inbounds) - pub(crate) fn simd_arith_offset(ptrs: T, offsets: U) -> T; - /// fptoui/fptosi/uitofp/sitofp /// casting floats to integers is truncating, so it is safe to convert values like e.g. 1.5 /// but the truncated value must fit in the target type or the result is poison. @@ -150,4 +147,17 @@ extern "platform-intrinsic" { pub(crate) fn simd_select(m: M, yes: T, no: T) -> T; #[allow(unused)] pub(crate) fn simd_select_bitmask(m: M, yes: T, no: T) -> T; + + /// getelementptr (without inbounds) + /// equivalent to wrapping_offset + pub(crate) fn simd_arith_offset(ptr: T, offset: U) -> T; + + /// equivalent to `T as U` semantics, specifically for pointers + pub(crate) fn simd_cast_ptr(ptr: T) -> U; + + /// expose a pointer as an address + pub(crate) fn simd_expose_addr(ptr: T) -> U; + + /// convert an exposed address back to a pointer + pub(crate) fn simd_from_exposed_addr(addr: T) -> U; } diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs index 63723e2ec13..2b91eb9e800 100644 --- a/crates/core_simd/src/lane_count.rs +++ b/crates/core_simd/src/lane_count.rs @@ -23,24 +23,20 @@ pub trait SupportedLaneCount: Sealed { impl Sealed for LaneCount {} -impl SupportedLaneCount for LaneCount<1> { - type BitMask = [u8; 1]; -} -impl SupportedLaneCount for LaneCount<2> { - type BitMask = [u8; 1]; -} -impl SupportedLaneCount for LaneCount<4> { - type BitMask = [u8; 1]; -} -impl SupportedLaneCount for LaneCount<8> { - type BitMask = [u8; 1]; -} -impl SupportedLaneCount for 
LaneCount<16> { - type BitMask = [u8; 2]; -} -impl SupportedLaneCount for LaneCount<32> { - type BitMask = [u8; 4]; -} -impl SupportedLaneCount for LaneCount<64> { - type BitMask = [u8; 8]; +macro_rules! supported_lane_count { + ($($lanes:literal),+) => { + $( + impl SupportedLaneCount for LaneCount<$lanes> { + type BitMask = [u8; ($lanes + 7) / 8]; + } + )+ + }; } + +supported_lane_count!(1, 2, 4, 8, 16, 32, 64); +#[cfg(feature = "all_lane_counts")] +supported_lane_count!( + 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63 +); diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs index 715f258f617..e5307de2155 100644 --- a/crates/core_simd/src/lib.rs +++ b/crates/core_simd/src/lib.rs @@ -1,5 +1,8 @@ #![no_std] #![feature( + const_refs_to_cell, + const_maybe_uninit_as_mut_ptr, + const_mut_refs, convert_float_to_int, decl_macro, intra_doc_pointers, @@ -7,7 +10,9 @@ repr_simd, simd_ffi, staged_api, - stdsimd + stdsimd, + strict_provenance, + ptr_metadata )] #![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))] #![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))] @@ -19,4 +24,3 @@ #[path = "mod.rs"] mod core_simd; pub use self::core_simd::simd; -pub use simd::*; diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index c36c336d8a2..e58df80fca8 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -55,6 +55,7 @@ pub unsafe trait MaskElement: SimdElement + Sealed {} macro_rules! impl_element { { $ty:ty } => { impl Sealed for $ty { + #[inline] fn valid(value: Simd) -> bool where LaneCount: SupportedLaneCount, @@ -62,6 +63,7 @@ macro_rules! 
impl_element { (value.simd_eq(Simd::splat(0 as _)) | value.simd_eq(Simd::splat(-1 as _))).all() } + #[inline] fn eq(self, other: Self) -> bool { self == other } const TRUE: Self = -1; @@ -83,7 +85,9 @@ impl_element! { isize } /// /// Masks represent boolean inclusion/exclusion on a per-lane basis. /// -/// The layout of this type is unspecified. +/// The layout of this type is unspecified, and may change between platforms +/// and/or Rust versions, and code should not assume that it is equivalent to +/// `[T; LANES]`. #[repr(transparent)] pub struct Mask(mask_impl::Mask) where @@ -102,6 +106,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn clone(&self) -> Self { *self } @@ -113,11 +118,13 @@ where LaneCount: SupportedLaneCount, { /// Construct a mask by setting all lanes to the given value. + #[inline] pub fn splat(value: bool) -> Self { Self(mask_impl::Mask::splat(value)) } /// Converts an array of bools to a SIMD mask. + #[inline] pub fn from_array(array: [bool; LANES]) -> Self { // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of // true: 0b_0000_0001 @@ -134,6 +141,7 @@ where } /// Converts a SIMD mask to an array of bools. + #[inline] pub fn to_array(self) -> [bool; LANES] { // This follows mostly the same logic as from_array. // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of @@ -261,6 +269,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn from(array: [bool; LANES]) -> Self { Self::from_array(array) } @@ -271,6 +280,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn from(vector: Mask) -> Self { vector.to_array() } @@ -520,60 +530,6 @@ where } } -/// A mask for SIMD vectors with eight elements of 8 bits. -pub type mask8x8 = Mask; - -/// A mask for SIMD vectors with 16 elements of 8 bits. -pub type mask8x16 = Mask; - -/// A mask for SIMD vectors with 32 elements of 8 bits. -pub type mask8x32 = Mask; - -/// A mask for SIMD vectors with 64 elements of 8 bits. 
-pub type mask8x64 = Mask; - -/// A mask for SIMD vectors with four elements of 16 bits. -pub type mask16x4 = Mask; - -/// A mask for SIMD vectors with eight elements of 16 bits. -pub type mask16x8 = Mask; - -/// A mask for SIMD vectors with 16 elements of 16 bits. -pub type mask16x16 = Mask; - -/// A mask for SIMD vectors with 32 elements of 16 bits. -pub type mask16x32 = Mask; - -/// A mask for SIMD vectors with two elements of 32 bits. -pub type mask32x2 = Mask; - -/// A mask for SIMD vectors with four elements of 32 bits. -pub type mask32x4 = Mask; - -/// A mask for SIMD vectors with eight elements of 32 bits. -pub type mask32x8 = Mask; - -/// A mask for SIMD vectors with 16 elements of 32 bits. -pub type mask32x16 = Mask; - -/// A mask for SIMD vectors with two elements of 64 bits. -pub type mask64x2 = Mask; - -/// A mask for SIMD vectors with four elements of 64 bits. -pub type mask64x4 = Mask; - -/// A mask for SIMD vectors with eight elements of 64 bits. -pub type mask64x8 = Mask; - -/// A mask for SIMD vectors with two elements of pointer width. -pub type masksizex2 = Mask; - -/// A mask for SIMD vectors with four elements of pointer width. -pub type masksizex4 = Mask; - -/// A mask for SIMD vectors with eight elements of pointer width. -pub type masksizex8 = Mask; - macro_rules! impl_from { { $from:ty => $($to:ty),* } => { $( @@ -581,6 +537,7 @@ macro_rules! 
impl_from { where LaneCount: SupportedLaneCount, { + #[inline] fn from(value: Mask<$from, LANES>) -> Self { value.cast() } diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs index 365ecc0a325..20465ba9b07 100644 --- a/crates/core_simd/src/masks/bitmask.rs +++ b/crates/core_simd/src/masks/bitmask.rs @@ -26,6 +26,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn clone(&self) -> Self { *self } @@ -36,6 +37,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn eq(&self, other: &Self) -> bool { self.0.as_ref() == other.0.as_ref() } @@ -46,6 +48,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn partial_cmp(&self, other: &Self) -> Option { self.0.as_ref().partial_cmp(other.0.as_ref()) } @@ -63,6 +66,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn cmp(&self, other: &Self) -> core::cmp::Ordering { self.0.as_ref().cmp(other.0.as_ref()) } diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs index b5ba198e504..1d13c45b8e7 100644 --- a/crates/core_simd/src/masks/full_masks.rs +++ b/crates/core_simd/src/masks/full_masks.rs @@ -37,6 +37,7 @@ where T: MaskElement + PartialEq, LaneCount: SupportedLaneCount, { + #[inline] fn eq(&self, other: &Self) -> bool { self.0.eq(&other.0) } @@ -47,6 +48,7 @@ where T: MaskElement + PartialOrd, LaneCount: SupportedLaneCount, { + #[inline] fn partial_cmp(&self, other: &Self) -> Option { self.0.partial_cmp(&other.0) } @@ -64,6 +66,7 @@ where T: MaskElement + Ord, LaneCount: SupportedLaneCount, { + #[inline] fn cmp(&self, other: &Self) -> core::cmp::Ordering { self.0.cmp(&other.0) } @@ -262,6 +265,7 @@ where T: MaskElement, LaneCount: SupportedLaneCount, { + #[inline] fn from(value: Mask) -> Self { value.0 } diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs index 2235f016c71..fc7d6b781f2 100644 --- 
a/crates/core_simd/src/masks/to_bitmask.rs +++ b/crates/core_simd/src/masks/to_bitmask.rs @@ -48,10 +48,12 @@ macro_rules! impl_integer_intrinsic { impl ToBitMask for Mask { type BitMask = $int; + #[inline] fn to_bitmask(self) -> $int { self.0.to_bitmask_integer() } + #[inline] fn from_bitmask(bitmask: $int) -> Self { Self(mask_impl::Mask::from_bitmask_integer(bitmask)) } @@ -83,10 +85,12 @@ where { const BYTES: usize = bitmask_len(LANES); + #[inline] fn to_bitmask_array(self) -> [u8; Self::BYTES] { self.0.to_bitmask_array() } + #[inline] fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self { Mask(mask_impl::Mask::from_bitmask_array(bitmask)) } diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs index b472aa3abe2..35c659b7a42 100644 --- a/crates/core_simd/src/mod.rs +++ b/crates/core_simd/src/mod.rs @@ -6,6 +6,8 @@ pub(crate) mod intrinsics; #[cfg(feature = "generic_const_exprs")] mod to_bytes; +mod alias; +mod cast; mod elements; mod eq; mod fmt; @@ -15,6 +17,7 @@ mod masks; mod ops; mod ord; mod select; +mod swizzle_dyn; mod vector; mod vendor; @@ -22,11 +25,14 @@ mod vendor; pub mod simd { pub(crate) use crate::core_simd::intrinsics; + pub use crate::core_simd::alias::*; + pub use crate::core_simd::cast::*; pub use crate::core_simd::elements::*; pub use crate::core_simd::eq::*; pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount}; pub use crate::core_simd::masks::*; pub use crate::core_simd::ord::*; pub use crate::core_simd::swizzle::*; + pub use crate::core_simd::swizzle_dyn::*; pub use crate::core_simd::vector::*; } diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/ord.rs index 9a87bc2e344..1ae9cd061fb 100644 --- a/crates/core_simd/src/ord.rs +++ b/crates/core_simd/src/ord.rs @@ -1,4 +1,6 @@ -use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount}; +use crate::simd::{ + intrinsics, LaneCount, Mask, Simd, SimdConstPtr, SimdMutPtr, SimdPartialEq, SupportedLaneCount, +}; /// 
Parallel `PartialOrd`. pub trait SimdPartialOrd: SimdPartialEq { @@ -211,3 +213,101 @@ macro_rules! impl_mask { } impl_mask! { i8, i16, i32, i64, isize } + +impl SimdPartialOrd for Simd<*const T, LANES> +where + LaneCount: SupportedLaneCount, +{ + #[inline] + fn simd_lt(self, other: Self) -> Self::Mask { + self.addr().simd_lt(other.addr()) + } + + #[inline] + fn simd_le(self, other: Self) -> Self::Mask { + self.addr().simd_le(other.addr()) + } + + #[inline] + fn simd_gt(self, other: Self) -> Self::Mask { + self.addr().simd_gt(other.addr()) + } + + #[inline] + fn simd_ge(self, other: Self) -> Self::Mask { + self.addr().simd_ge(other.addr()) + } +} + +impl SimdOrd for Simd<*const T, LANES> +where + LaneCount: SupportedLaneCount, +{ + #[inline] + fn simd_max(self, other: Self) -> Self { + self.simd_lt(other).select(other, self) + } + + #[inline] + fn simd_min(self, other: Self) -> Self { + self.simd_gt(other).select(other, self) + } + + #[inline] + fn simd_clamp(self, min: Self, max: Self) -> Self { + assert!( + min.simd_le(max).all(), + "each lane in `min` must be less than or equal to the corresponding lane in `max`", + ); + self.simd_max(min).simd_min(max) + } +} + +impl SimdPartialOrd for Simd<*mut T, LANES> +where + LaneCount: SupportedLaneCount, +{ + #[inline] + fn simd_lt(self, other: Self) -> Self::Mask { + self.addr().simd_lt(other.addr()) + } + + #[inline] + fn simd_le(self, other: Self) -> Self::Mask { + self.addr().simd_le(other.addr()) + } + + #[inline] + fn simd_gt(self, other: Self) -> Self::Mask { + self.addr().simd_gt(other.addr()) + } + + #[inline] + fn simd_ge(self, other: Self) -> Self::Mask { + self.addr().simd_ge(other.addr()) + } +} + +impl SimdOrd for Simd<*mut T, LANES> +where + LaneCount: SupportedLaneCount, +{ + #[inline] + fn simd_max(self, other: Self) -> Self { + self.simd_lt(other).select(other, self) + } + + #[inline] + fn simd_min(self, other: Self) -> Self { + self.simd_gt(other).select(other, self) + } + + #[inline] + fn 
simd_clamp(self, min: Self, max: Self) -> Self { + assert!( + min.simd_le(max).all(), + "each lane in `min` must be less than or equal to the corresponding lane in `max`", + ); + self.simd_max(min).simd_min(max) + } +} diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs index 22999d24950..68f20516cf5 100644 --- a/crates/core_simd/src/swizzle.rs +++ b/crates/core_simd/src/swizzle.rs @@ -265,16 +265,13 @@ where /// Interleave two vectors. /// - /// Produces two vectors with lanes taken alternately from `self` and `other`. + /// The resulting vectors contain lanes taken alternatively from `self` and `other`, first + /// filling the first result, and then the second. /// - /// The first result contains the first `LANES / 2` lanes from `self` and `other`, - /// alternating, starting with the first lane of `self`. - /// - /// The second result contains the last `LANES / 2` lanes from `self` and `other`, - /// alternating, starting with the lane `LANES / 2` from the start of `self`. + /// The reverse of this operation is [`Simd::deinterleave`]. 
/// /// ``` - /// #![feature(portable_simd)] + /// # #![feature(portable_simd)] /// # use core::simd::Simd; /// let a = Simd::from_array([0, 1, 2, 3]); /// let b = Simd::from_array([4, 5, 6, 7]); @@ -285,29 +282,17 @@ where #[inline] #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn interleave(self, other: Self) -> (Self, Self) { - const fn lo() -> [Which; LANES] { + const fn interleave(high: bool) -> [Which; LANES] { let mut idx = [Which::First(0); LANES]; let mut i = 0; while i < LANES { - let offset = i / 2; - idx[i] = if i % 2 == 0 { - Which::First(offset) + // Treat the source as a concatenated vector + let dst_index = if high { i + LANES } else { i }; + let src_index = dst_index / 2 + (dst_index % 2) * LANES; + idx[i] = if src_index < LANES { + Which::First(src_index) } else { - Which::Second(offset) - }; - i += 1; - } - idx - } - const fn hi() -> [Which; LANES] { - let mut idx = [Which::First(0); LANES]; - let mut i = 0; - while i < LANES { - let offset = (LANES + i) / 2; - idx[i] = if i % 2 == 0 { - Which::First(offset) - } else { - Which::Second(offset) + Which::Second(src_index % LANES) }; i += 1; } @@ -318,11 +303,11 @@ where struct Hi; impl Swizzle2 for Lo { - const INDEX: [Which; LANES] = lo::(); + const INDEX: [Which; LANES] = interleave::(false); } impl Swizzle2 for Hi { - const INDEX: [Which; LANES] = hi::(); + const INDEX: [Which; LANES] = interleave::(true); } (Lo::swizzle2(self, other), Hi::swizzle2(self, other)) @@ -336,8 +321,10 @@ where /// The second result takes every other lane of `self` and then `other`, starting with /// the second lane. /// + /// The reverse of this operation is [`Simd::interleave`]. 
+ /// /// ``` - /// #![feature(portable_simd)] + /// # #![feature(portable_simd)] /// # use core::simd::Simd; /// let a = Simd::from_array([0, 4, 1, 5]); /// let b = Simd::from_array([2, 6, 3, 7]); @@ -348,22 +335,17 @@ where #[inline] #[must_use = "method returns a new vector and does not mutate the original inputs"] pub fn deinterleave(self, other: Self) -> (Self, Self) { - const fn even() -> [Which; LANES] { + const fn deinterleave(second: bool) -> [Which; LANES] { let mut idx = [Which::First(0); LANES]; let mut i = 0; - while i < LANES / 2 { - idx[i] = Which::First(2 * i); - idx[i + LANES / 2] = Which::Second(2 * i); - i += 1; - } - idx - } - const fn odd() -> [Which; LANES] { - let mut idx = [Which::First(0); LANES]; - let mut i = 0; - while i < LANES / 2 { - idx[i] = Which::First(2 * i + 1); - idx[i + LANES / 2] = Which::Second(2 * i + 1); + while i < LANES { + // Treat the source as a concatenated vector + let src_index = i * 2 + second as usize; + idx[i] = if src_index < LANES { + Which::First(src_index) + } else { + Which::Second(src_index % LANES) + }; i += 1; } idx @@ -373,11 +355,11 @@ where struct Odd; impl Swizzle2 for Even { - const INDEX: [Which; LANES] = even::(); + const INDEX: [Which; LANES] = deinterleave::(false); } impl Swizzle2 for Odd { - const INDEX: [Which; LANES] = odd::(); + const INDEX: [Which; LANES] = deinterleave::(true); } (Even::swizzle2(self, other), Odd::swizzle2(self, other)) diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs new file mode 100644 index 00000000000..3eb80d5dca1 --- /dev/null +++ b/crates/core_simd/src/swizzle_dyn.rs @@ -0,0 +1,157 @@ +use crate::simd::{LaneCount, Simd, SupportedLaneCount}; +use core::mem; + +impl Simd +where + LaneCount: SupportedLaneCount, +{ + /// Swizzle a vector of bytes according to the index vector. + /// Indices within range select the appropriate byte. + /// Indices "out of bounds" instead select 0. 
+ /// + /// Note that the current implementation is selected during build-time + /// of the standard library, so `cargo build -Zbuild-std` may be necessary + /// to unlock better performance, especially for larger vectors. + /// A planned compiler improvement will enable using `#[target_feature]` instead. + #[inline] + pub fn swizzle_dyn(self, idxs: Simd) -> Self { + #![allow(unused_imports, unused_unsafe)] + #[cfg(target_arch = "aarch64")] + use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8}; + #[cfg(all(target_arch = "arm", target_feature = "v7"))] + use core::arch::arm::{uint8x8_t, vtbl1_u8}; + #[cfg(target_arch = "wasm32")] + use core::arch::wasm32 as wasm; + #[cfg(target_arch = "x86")] + use core::arch::x86; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as x86; + // SAFETY: Intrinsics covered by cfg + unsafe { + match N { + #[cfg(target_feature = "neon")] + 8 => transize(vtbl1_u8, self, idxs), + #[cfg(target_feature = "ssse3")] + 16 => transize(x86::_mm_shuffle_epi8, self, idxs), + #[cfg(target_feature = "simd128")] + 16 => transize(wasm::i8x16_swizzle, self, idxs), + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + 16 => transize(vqtbl1q_u8, self, idxs), + #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))] + 32 => transize_raw(avx2_pshufb, self, idxs), + #[cfg(target_feature = "avx512vl,avx512vbmi")] + 32 => transize(x86::_mm256_permutexvar_epi8, self, idxs), + // Notable absence: avx512bw shuffle + // If avx512bw is available, odds of avx512vbmi are good + // FIXME: initial AVX512VBMI variant didn't actually pass muster + // #[cfg(target_feature = "avx512vbmi")] + // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs), + _ => { + let mut array = [0; N]; + for (i, k) in idxs.to_array().into_iter().enumerate() { + if (k as usize) < N { + array[i] = self[k as usize]; + }; + } + array.into() + } + } + } + } +} + +/// "vpshufb like it was meant to be" on AVX2 +/// +/// # Safety +/// This requires AVX2 
to work +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "avx2")] +#[allow(unused)] +#[inline] +#[allow(clippy::let_and_return)] +unsafe fn avx2_pshufb(bytes: Simd, idxs: Simd) -> Simd { + use crate::simd::SimdPartialOrd; + #[cfg(target_arch = "x86")] + use core::arch::x86; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as x86; + use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle; + use x86::_mm256_shuffle_epi8 as avx2_half_pshufb; + let mid = Simd::splat(16u8); + let high = mid + mid; + // SAFETY: Caller promised AVX2 + unsafe { + // This is ordering sensitive, and LLVM will order these how you put them. + // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes. + // But the "compose" step will lower to ops that can also use at least 1 other port. + // So this tries to break up permutes so composition flows through "open" ports. + // Comparative benches should be done on multiple AVX2 CPUs before reordering this + + let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into()); + let hi_shuf = Simd::from(avx2_half_pshufb( + hihi, // duplicate the vector's top half + idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31 + )); + // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics + let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0)); + let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into()); + let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into())); + // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step + let compose = idxs.simd_lt(mid).select(lo_shuf, compose); + compose + } +} + +/// This sets up a call to an architecture-specific function, and in doing so +/// it persuades rustc that everything is the correct size. Which it is. 
+/// This would not be needed if one could convince Rust that, by matching on N, +/// N is that value, and thus it would be valid to substitute e.g. 16. +/// +/// # Safety +/// The correctness of this function hinges on the sizes agreeing in actuality. +#[allow(dead_code)] +#[inline(always)] +unsafe fn transize( + f: unsafe fn(T, T) -> T, + bytes: Simd, + idxs: Simd, +) -> Simd +where + LaneCount: SupportedLaneCount, +{ + let idxs = zeroing_idxs(idxs); + // SAFETY: Same obligation to use this function as to use mem::transmute_copy. + unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } +} + +/// Make indices that yield 0 for this architecture +#[inline(always)] +fn zeroing_idxs(idxs: Simd) -> Simd +where + LaneCount: SupportedLaneCount, +{ + // On x86, make sure the top bit is set. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let idxs = { + use crate::simd::SimdPartialOrd; + idxs.simd_lt(Simd::splat(N as u8)) + .select(idxs, Simd::splat(u8::MAX)) + }; + // Simply do nothing on most architectures. + idxs +} + +/// As transize but no implicit call to `zeroing_idxs`. +#[allow(dead_code)] +#[inline(always)] +unsafe fn transize_raw( + f: unsafe fn(T, T) -> T, + bytes: Simd, + idxs: Simd, +) -> Simd +where + LaneCount: SupportedLaneCount, +{ + // SAFETY: Same obligation to use this function as to use mem::transmute_copy. + unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } +} diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index d52d1ac4d3a..3809cc96151 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -1,60 +1,63 @@ -mod float; -mod int; -mod uint; - -pub use float::*; -pub use int::*; -pub use uint::*; - -// Vectors of pointers are not for public use at the current time. 
-pub(crate) mod ptr; - use crate::simd::{ - intrinsics, LaneCount, Mask, MaskElement, SimdPartialOrd, SupportedLaneCount, Swizzle, + intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdCastPtr, SimdConstPtr, SimdMutPtr, + SimdPartialOrd, SupportedLaneCount, Swizzle, }; +use core::convert::{TryFrom, TryInto}; -/// A SIMD vector of `LANES` elements of type `T`. `Simd` has the same shape as [`[T; N]`](array), but operates like `T`. +/// A SIMD vector with the shape of `[T; N]` but the operations of `T`. /// -/// Two vectors of the same type and length will, by convention, support the operators (+, *, etc.) that `T` does. -/// These take the lanes at each index on the left-hand side and right-hand side, perform the operation, -/// and return the result in the same lane in a vector of equal size. For a given operator, this is equivalent to zipping -/// the two arrays together and mapping the operator over each lane. +/// `Simd` supports the operators (+, *, etc.) that `T` does in "elementwise" fashion. +/// These take the element at each index from the left-hand side and right-hand side, +/// perform the operation, then return the result in the same index in a vector of equal size. +/// However, `Simd` differs from normal iteration and normal arrays: +/// - `Simd` executes `N` operations in a single step with no `break`s +/// - `Simd` can have an alignment greater than `T`, for better mechanical sympathy +/// +/// By always imposing these constraints on `Simd`, it is easier to compile elementwise operations +/// into machine instructions that can themselves be executed in parallel. 
/// /// ```rust -/// # #![feature(array_zip, portable_simd)] +/// # #![feature(portable_simd)] /// # use core::simd::{Simd}; -/// let a0: [i32; 4] = [-2, 0, 2, 4]; -/// let a1 = [10, 9, 8, 7]; -/// let zm_add = a0.zip(a1).map(|(lhs, rhs)| lhs + rhs); -/// let zm_mul = a0.zip(a1).map(|(lhs, rhs)| lhs * rhs); +/// # use core::array; +/// let a: [i32; 4] = [-2, 0, 2, 4]; +/// let b = [10, 9, 8, 7]; +/// let sum = array::from_fn(|i| a[i] + b[i]); +/// let prod = array::from_fn(|i| a[i] * b[i]); /// /// // `Simd` implements `From<[T; N]>` -/// let (v0, v1) = (Simd::from(a0), Simd::from(a1)); +/// let (v, w) = (Simd::from(a), Simd::from(b)); /// // Which means arrays implement `Into>`. -/// assert_eq!(v0 + v1, zm_add.into()); -/// assert_eq!(v0 * v1, zm_mul.into()); +/// assert_eq!(v + w, sum.into()); +/// assert_eq!(v * w, prod.into()); /// ``` /// -/// `Simd` with integers has the quirk that these operations are also inherently wrapping, as if `T` was [`Wrapping`]. +/// +/// `Simd` with integer elements treats operators as wrapping, as if `T` was [`Wrapping`]. /// Thus, `Simd` does not implement `wrapping_add`, because that is the default behavior. /// This means there is no warning on overflows, even in "debug" builds. /// For most applications where `Simd` is appropriate, it is "not a bug" to wrap, /// and even "debug builds" are unlikely to tolerate the loss of performance. /// You may want to consider using explicitly checked arithmetic if such is required. -/// Division by zero still causes a panic, so you may want to consider using floating point numbers if that is unacceptable. +/// Division by zero on integers still causes a panic, so +/// you may want to consider using `f32` or `f64` if that is unacceptable. /// /// [`Wrapping`]: core::num::Wrapping /// /// # Layout -/// `Simd` has a layout similar to `[T; N]` (identical "shapes"), but with a greater alignment. +/// `Simd` has a layout similar to `[T; N]` (identical "shapes"), with a greater alignment. 
/// `[T; N]` is aligned to `T`, but `Simd` will have an alignment based on both `T` and `N`. -/// It is thus sound to [`transmute`] `Simd` to `[T; N]`, and will typically optimize to zero cost, -/// but the reverse transmutation is more likely to require a copy the compiler cannot simply elide. +/// Thus it is sound to [`transmute`] `Simd` to `[T; N]` and should optimize to "zero cost", +/// but the reverse transmutation may require a copy the compiler cannot simply elide. /// /// # ABI "Features" -/// Due to Rust's safety guarantees, `Simd` is currently passed to and from functions via memory, not SIMD registers, -/// except as an optimization. `#[inline]` hints are recommended on functions that accept `Simd` or return it. -/// The need for this may be corrected in the future. +/// Due to Rust's safety guarantees, `Simd` is currently passed and returned via memory, +/// not SIMD registers, except as an optimization. Using `#[inline]` on functions that accept +/// `Simd` or return it is recommended, at the cost of code generation time, as +/// inlining SIMD-using functions can omit a large function prolog or epilog and thus +/// improve both speed and code size. The need for this may be corrected in the future. +/// +/// Using `#[inline(always)]` still requires additional care. /// /// # Safe SIMD with Unsafe Rust /// @@ -65,18 +68,22 @@ use crate::simd::{ /// Thus, when using `unsafe` Rust to read and write `Simd` through [raw pointers], it is a good idea to first try with /// [`read_unaligned`] and [`write_unaligned`]. 
This is because: /// - [`read`] and [`write`] require full alignment (in this case, `Simd`'s alignment) -/// - the likely source for reading or destination for writing `Simd` is [`[T]`](slice) and similar types, aligned to `T` -/// - combining these actions would violate the `unsafe` contract and explode the program into a puff of **undefined behavior** -/// - the compiler can implicitly adjust layouts to make unaligned reads or writes fully aligned if it sees the optimization -/// - most contemporary processors suffer no performance penalty for "unaligned" reads and writes that are aligned at runtime +/// - `Simd` is often read from or written to [`[T]`](slice) and other types aligned to `T` +/// - combining these actions violates the `unsafe` contract and explodes the program into +/// a puff of **undefined behavior** +/// - the compiler can implicitly adjust layouts to make unaligned reads or writes fully aligned +/// if it sees the optimization +/// - most contemporary processors with "aligned" and "unaligned" read and write instructions +/// exhibit no performance difference if the "unaligned" variant is aligned at runtime /// -/// By imposing less obligations, unaligned functions are less likely to make the program unsound, +/// Less obligations mean unaligned reads and writes are less likely to make the program unsound, /// and may be just as fast as stricter alternatives. -/// When trying to guarantee alignment, [`[T]::as_simd`][as_simd] is an option for converting `[T]` to `[Simd]`, -/// and allows soundly operating on an aligned SIMD body, but it may cost more time when handling the scalar head and tail. -/// If these are not sufficient, then it is most ideal to design data structures to be already aligned -/// to the `Simd` you wish to use before using `unsafe` Rust to read or write. 
-/// More conventional ways to compensate for these facts, like materializing `Simd` to or from an array first, +/// When trying to guarantee alignment, [`[T]::as_simd`][as_simd] is an option for +/// converting `[T]` to `[Simd]`, and allows soundly operating on an aligned SIMD body, +/// but it may cost more time when handling the scalar head and tail. +/// If these are not enough, it is most ideal to design data structures to be already aligned +/// to `mem::align_of::>()` before using `unsafe` Rust to read or write. +/// Other ways to compensate for these facts, like materializing `Simd` to or from an array first, /// are handled by safe methods like [`Simd::from_array`] and [`Simd::from_slice`]. /// /// [`transmute`]: core::mem::transmute @@ -86,21 +93,26 @@ use crate::simd::{ /// [`read`]: pointer::read /// [`write`]: pointer::write /// [as_simd]: slice::as_simd +// +// NOTE: Accessing the inner array directly in any way (e.g. by using the `.0` field syntax) or +// directly constructing an instance of the type (i.e. `let vector = Simd(array)`) should be +// avoided, as it will likely become illegal on `#[repr(simd)]` structs in the future. It also +// causes rustc to emit illegal LLVM IR in some cases. #[repr(simd)] -pub struct Simd([T; LANES]) +pub struct Simd([T; N]) where - T: SimdElement, - LaneCount: SupportedLaneCount; + LaneCount: SupportedLaneCount, + T: SimdElement; -impl Simd +impl Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { - /// Number of lanes in this vector. - pub const LANES: usize = LANES; + /// Number of elements in this vector. + pub const LANES: usize = N; - /// Returns the number of lanes in this SIMD vector. + /// Returns the number of elements in this SIMD vector. /// /// # Examples /// @@ -111,10 +123,10 @@ where /// assert_eq!(v.lanes(), 4); /// ``` pub const fn lanes(&self) -> usize { - LANES + Self::LANES } - /// Constructs a new SIMD vector with all lanes set to the given value. 
+ /// Constructs a new SIMD vector with all elements set to the given value. /// /// # Examples /// @@ -125,11 +137,11 @@ where /// assert_eq!(v.as_array(), &[8, 8, 8, 8]); /// ``` pub fn splat(value: T) -> Self { - // This is preferred over `[value; LANES]`, since it's explicitly a splat: + // This is preferred over `[value; N]`, since it's explicitly a splat: // https://github.com/rust-lang/rust/issues/97804 struct Splat; - impl Swizzle<1, LANES> for Splat { - const INDEX: [usize; LANES] = [0; LANES]; + impl Swizzle<1, N> for Splat { + const INDEX: [usize; N] = [0; N]; } Splat::swizzle(Simd::::from([value])) } @@ -144,32 +156,100 @@ where /// let v: u64x4 = Simd::from_array([0, 1, 2, 3]); /// assert_eq!(v.as_array(), &[0, 1, 2, 3]); /// ``` - pub const fn as_array(&self) -> &[T; LANES] { - &self.0 + pub const fn as_array(&self) -> &[T; N] { + // SAFETY: `Simd` is just an overaligned `[T; N]` with + // potential padding at the end, so pointer casting to a + // `&[T; N]` is safe. + // + // NOTE: This deliberately doesn't just use `&self.0`, see the comment + // on the struct definition for details. + unsafe { &*(self as *const Self as *const [T; N]) } } /// Returns a mutable array reference containing the entire SIMD vector. - pub fn as_mut_array(&mut self) -> &mut [T; LANES] { - &mut self.0 + pub fn as_mut_array(&mut self) -> &mut [T; N] { + // SAFETY: `Simd` is just an overaligned `[T; N]` with + // potential padding at the end, so pointer casting to a + // `&mut [T; N]` is safe. + // + // NOTE: This deliberately doesn't just use `&mut self.0`, see the comment + // on the struct definition for details. + unsafe { &mut *(self as *mut Self as *mut [T; N]) } + } + + /// Load a vector from an array of `T`. + /// + /// This function is necessary since `repr(simd)` has padding for non-power-of-2 vectors (at the time of writing). + /// With padding, `read_unaligned` will read past the end of an array of N elements. 
+ /// + /// # Safety + /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read_unaligned`. + const unsafe fn load(ptr: *const [T; N]) -> Self { + // There are potentially simpler ways to write this function, but this should result in + // LLVM `load ` + + let mut tmp = core::mem::MaybeUninit::::uninit(); + // SAFETY: `Simd` always contains `N` elements of type `T`. It may have padding + // which does not need to be initialized. The safety of reading `ptr` is ensured by the + // caller. + unsafe { + core::ptr::copy_nonoverlapping(ptr, tmp.as_mut_ptr().cast(), 1); + tmp.assume_init() + } + } + + /// Store a vector to an array of `T`. + /// + /// See `load` as to why this function is necessary. + /// + /// # Safety + /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write_unaligned`. + const unsafe fn store(self, ptr: *mut [T; N]) { + // There are potentially simpler ways to write this function, but this should result in + // LLVM `store ` + + // Creating a temporary helps LLVM turn the memcpy into a store. + let tmp = self; + // SAFETY: `Simd` always contains `N` elements of type `T`. The safety of writing + // `ptr` is ensured by the caller. + unsafe { core::ptr::copy_nonoverlapping(tmp.as_array(), ptr, 1) } } /// Converts an array to a SIMD vector. - pub const fn from_array(array: [T; LANES]) -> Self { - Self(array) + pub const fn from_array(array: [T; N]) -> Self { + // SAFETY: `&array` is safe to read. + // + // FIXME: We currently use a pointer load instead of `transmute_copy` because `repr(simd)` + // results in padding for non-power-of-2 vectors (so vectors are larger than arrays). + // + // NOTE: This deliberately doesn't just use `Self(array)`, see the comment + // on the struct definition for details. + unsafe { Self::load(&array) } } /// Converts a SIMD vector to an array. 
- pub const fn to_array(self) -> [T; LANES] { - self.0 + pub const fn to_array(self) -> [T; N] { + let mut tmp = core::mem::MaybeUninit::uninit(); + // SAFETY: writing to `tmp` is safe and initializes it. + // + // FIXME: We currently use a pointer store instead of `transmute_copy` because `repr(simd)` + // results in padding for non-power-of-2 vectors (so vectors are larger than arrays). + // + // NOTE: This deliberately doesn't just use `self.0`, see the comment + // on the struct definition for details. + unsafe { + self.store(tmp.as_mut_ptr()); + tmp.assume_init() + } } - /// Converts a slice to a SIMD vector containing `slice[..LANES]`. + /// Converts a slice to a SIMD vector containing `slice[..N]`. /// /// # Panics /// - /// Panics if the slice's length is less than the vector's `Simd::LANES`. + /// Panics if the slice's length is less than the vector's `Simd::N`. /// - /// # Examples + /// # Example /// /// ``` /// # #![feature(portable_simd)] @@ -180,22 +260,49 @@ where /// ``` #[must_use] pub const fn from_slice(slice: &[T]) -> Self { - assert!(slice.len() >= LANES, "slice length must be at least the number of lanes"); - let mut array = [slice[0]; LANES]; - let mut i = 0; - while i < LANES { - array[i] = slice[i]; - i += 1; - } - Self(array) + assert!( + slice.len() >= Self::LANES, + "slice length must be at least the number of elements" + ); + // SAFETY: We just checked that the slice contains + // at least `N` elements. + unsafe { Self::load(slice.as_ptr().cast()) } } - /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type. + /// Writes a SIMD vector to the first `N` elements of a slice. /// - /// This follows the semantics of Rust's `as` conversion for casting - /// integers to unsigned integers (interpreting as the other type, so `-1` to `MAX`), - /// and from floats to integers (truncating, or saturating at the limits) for each lane, - /// or vice versa. 
+ /// # Panics + /// + /// Panics if the slice's length is less than the vector's `Simd::N`. + /// + /// # Example + /// + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::u32x4; + /// let mut dest = vec![0; 6]; + /// let v = u32x4::from_array([1, 2, 3, 4]); + /// v.copy_to_slice(&mut dest); + /// assert_eq!(&dest, &[1, 2, 3, 4, 0, 0]); + /// ``` + pub fn copy_to_slice(self, slice: &mut [T]) { + assert!( + slice.len() >= Self::LANES, + "slice length must be at least the number of elements" + ); + // SAFETY: We just checked that the slice contains + // at least `N` elements. + unsafe { self.store(slice.as_mut_ptr().cast()) } + } + + /// Performs elementwise conversion of a SIMD vector's elements to another SIMD-valid type. + /// + /// This follows the semantics of Rust's `as` conversion for casting integers between + /// signed and unsigned (interpreting integers as 2s complement, so `-1` to `U::MAX` and + /// `1 << (U::BITS -1)` becoming `I::MIN` ), and from floats to integers (truncating, + /// or saturating at the limits) for each element. /// /// # Examples /// ``` @@ -215,11 +322,26 @@ where /// ``` #[must_use] #[inline] - pub fn cast(self) -> Simd { - // Safety: The input argument is a vector of a valid SIMD element type. + pub fn cast(self) -> Simd + where + T: SimdCast, + { + // Safety: supported types are guaranteed by SimdCast unsafe { intrinsics::simd_as(self) } } + /// Casts a vector of pointers to another pointer type. + #[must_use] + #[inline] + pub fn cast_ptr(self) -> Simd + where + T: SimdCastPtr, + U: SimdElement, + { + // Safety: supported types are guaranteed by SimdCastPtr + unsafe { intrinsics::simd_cast_ptr(self) } + } + /// Rounds toward zero and converts to the same-width integer type, assuming that /// the value is finite and fits in that type. 
/// @@ -235,90 +357,90 @@ where /// /// [cast]: Simd::cast #[inline] - pub unsafe fn to_int_unchecked(self) -> Simd + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces + pub unsafe fn to_int_unchecked(self) -> Simd where - T: core::convert::FloatToInt, - I: SimdElement, + T: core::convert::FloatToInt + SimdCast, + I: SimdCast, { - // Safety: `self` is a vector, and `FloatToInt` ensures the type can be casted to - // an integer. + // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants unsafe { intrinsics::simd_cast(self) } } /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. - /// If an index is out-of-bounds, the lane is instead selected from the `or` vector. + /// If an index is out-of-bounds, the element is instead selected from the `or` vector. /// /// # Examples /// ``` /// # #![feature(portable_simd)] /// # use core::simd::Simd; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; - /// let idxs = Simd::from_array([9, 3, 0, 5]); + /// let idxs = Simd::from_array([9, 3, 0, 5]); // Note the index that is out-of-bounds /// let alt = Simd::from_array([-5, -4, -3, -2]); /// - /// let result = Simd::gather_or(&vec, idxs, alt); // Note the lane that is out-of-bounds. + /// let result = Simd::gather_or(&vec, idxs, alt); /// assert_eq!(result, Simd::from_array([-5, 13, 10, 15])); /// ``` #[must_use] #[inline] - pub fn gather_or(slice: &[T], idxs: Simd, or: Self) -> Self { + pub fn gather_or(slice: &[T], idxs: Simd, or: Self) -> Self { Self::gather_select(slice, Mask::splat(true), idxs, or) } - /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. - /// If an index is out-of-bounds, the lane is set to the default value for the type. + /// Reads from indices in `slice` to construct a SIMD vector. + /// If an index is out-of-bounds, the element is set to the default given by `T: Default`. 
/// /// # Examples /// ``` /// # #![feature(portable_simd)] /// # use core::simd::Simd; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; - /// let idxs = Simd::from_array([9, 3, 0, 5]); + /// let idxs = Simd::from_array([9, 3, 0, 5]); // Note the index that is out-of-bounds /// - /// let result = Simd::gather_or_default(&vec, idxs); // Note the lane that is out-of-bounds. + /// let result = Simd::gather_or_default(&vec, idxs); /// assert_eq!(result, Simd::from_array([0, 13, 10, 15])); /// ``` #[must_use] #[inline] - pub fn gather_or_default(slice: &[T], idxs: Simd) -> Self + pub fn gather_or_default(slice: &[T], idxs: Simd) -> Self where T: Default, { Self::gather_or(slice, idxs, Self::splat(T::default())) } - /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. - /// The mask `enable`s all `true` lanes and disables all `false` lanes. - /// If an index is disabled or is out-of-bounds, the lane is selected from the `or` vector. + /// Reads from indices in `slice` to construct a SIMD vector. + /// The mask `enable`s all `true` indices and disables all `false` indices. + /// If an index is disabled or is out-of-bounds, the element is selected from the `or` vector. /// /// # Examples /// ``` /// # #![feature(portable_simd)] /// # use core::simd::{Simd, Mask}; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; - /// let idxs = Simd::from_array([9, 3, 0, 5]); + /// let idxs = Simd::from_array([9, 3, 0, 5]); // Includes an out-of-bounds index /// let alt = Simd::from_array([-5, -4, -3, -2]); - /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane. + /// let enable = Mask::from_array([true, true, true, false]); // Includes a masked element /// - /// let result = Simd::gather_select(&vec, enable, idxs, alt); // Note the lane that is out-of-bounds. 
+ /// let result = Simd::gather_select(&vec, enable, idxs, alt); /// assert_eq!(result, Simd::from_array([-5, 13, 10, -2])); /// ``` #[must_use] #[inline] pub fn gather_select( slice: &[T], - enable: Mask, - idxs: Simd, + enable: Mask, + idxs: Simd, or: Self, ) -> Self { - let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); - // Safety: We have masked-off out-of-bounds lanes. + let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); + // Safety: We have masked-off out-of-bounds indices. unsafe { Self::gather_select_unchecked(slice, enable, idxs, or) } } - /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. - /// The mask `enable`s all `true` lanes and disables all `false` lanes. - /// If an index is disabled, the lane is selected from the `or` vector. + /// Reads from indices in `slice` to construct a SIMD vector. + /// The mask `enable`s all `true` indices and disables all `false` indices. + /// If an index is disabled, the element is selected from the `or` vector. /// /// # Safety /// @@ -332,57 +454,123 @@ where /// # #[cfg(not(feature = "as_crate"))] use core::simd; /// # use simd::{Simd, SimdPartialOrd, Mask}; /// let vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; - /// let idxs = Simd::from_array([9, 3, 0, 5]); + /// let idxs = Simd::from_array([9, 3, 0, 5]); // Includes an out-of-bounds index /// let alt = Simd::from_array([-5, -4, -3, -2]); - /// let enable = Mask::from_array([true, true, true, false]); // Note the final mask lane. + /// let enable = Mask::from_array([true, true, true, false]); // Includes a masked element /// // If this mask was used to gather, it would be unsound. Let's fix that. /// let enable = enable & idxs.simd_lt(Simd::splat(vec.len())); /// - /// // We have masked the OOB lane, so it's safe to gather now. + /// // The out-of-bounds index has been masked, so it's safe to gather now. 
/// let result = unsafe { Simd::gather_select_unchecked(&vec, enable, idxs, alt) }; /// assert_eq!(result, Simd::from_array([-5, 13, 10, -2])); /// ``` /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html #[must_use] #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub unsafe fn gather_select_unchecked( slice: &[T], - enable: Mask, - idxs: Simd, + enable: Mask, + idxs: Simd, or: Self, ) -> Self { - let base_ptr = crate::simd::ptr::SimdConstPtr::splat(slice.as_ptr()); + let base_ptr = Simd::<*const T, N>::splat(slice.as_ptr()); // Ferris forgive me, I have done pointer arithmetic here. let ptrs = base_ptr.wrapping_add(idxs); - // Safety: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah - unsafe { intrinsics::simd_gather(or, ptrs, enable.to_int()) } + // Safety: The caller is responsible for determining the indices are okay to read + unsafe { Self::gather_select_ptr(ptrs, enable, or) } + } + + /// Read elementwise from pointers into a SIMD vector. + /// + /// # Safety + /// + /// Each read must satisfy the same conditions as [`core::ptr::read`]. 
+ /// + /// # Example + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdConstPtr}; + /// let values = [6, 2, 4, 9]; + /// let offsets = Simd::from_array([1, 0, 0, 3]); + /// let source = Simd::splat(values.as_ptr()).wrapping_add(offsets); + /// let gathered = unsafe { Simd::gather_ptr(source) }; + /// assert_eq!(gathered, Simd::from_array([2, 6, 6, 9])); + /// ``` + #[must_use] + #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces + pub unsafe fn gather_ptr(source: Simd<*const T, N>) -> Self + where + T: Default, + { + // TODO: add an intrinsic that doesn't use a passthru vector, and remove the T: Default bound + // Safety: The caller is responsible for upholding all invariants + unsafe { Self::gather_select_ptr(source, Mask::splat(true), Self::default()) } + } + + /// Conditionally read elementwise from pointers into a SIMD vector. + /// The mask `enable`s all `true` pointers and disables all `false` pointers. + /// If a pointer is disabled, the element is selected from the `or` vector, + /// and no read is performed. + /// + /// # Safety + /// + /// Enabled elements must satisfy the same conditions as [`core::ptr::read`]. 
+ /// + /// # Example + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Mask, Simd, SimdConstPtr}; + /// let values = [6, 2, 4, 9]; + /// let enable = Mask::from_array([true, true, false, true]); + /// let offsets = Simd::from_array([1, 0, 0, 3]); + /// let source = Simd::splat(values.as_ptr()).wrapping_add(offsets); + /// let gathered = unsafe { Simd::gather_select_ptr(source, enable, Simd::splat(0)) }; + /// assert_eq!(gathered, Simd::from_array([2, 6, 0, 9])); + /// ``` + #[must_use] + #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces + pub unsafe fn gather_select_ptr( + source: Simd<*const T, N>, + enable: Mask, + or: Self, + ) -> Self { + // Safety: The caller is responsible for upholding all invariants + unsafe { intrinsics::simd_gather(or, source, enable.to_int()) } } /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`. - /// If two lanes in the scattered vector would write to the same index - /// only the last lane is guaranteed to actually be written. + /// If an index is out-of-bounds, the write is suppressed without panicking. + /// If two elements in the scattered vector would write to the same index + /// only the last element is guaranteed to actually be written. /// /// # Examples /// ``` /// # #![feature(portable_simd)] /// # use core::simd::Simd; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; - /// let idxs = Simd::from_array([9, 3, 0, 0]); + /// let idxs = Simd::from_array([9, 3, 0, 0]); // Note the duplicate index. /// let vals = Simd::from_array([-27, 82, -41, 124]); /// - /// vals.scatter(&mut vec, idxs); // index 0 receives two writes. + /// vals.scatter(&mut vec, idxs); // two logical writes means the last wins. 
/// assert_eq!(vec, vec![124, 11, 12, 82, 14, 15, 16, 17, 18]); /// ``` #[inline] - pub fn scatter(self, slice: &mut [T], idxs: Simd) { + pub fn scatter(self, slice: &mut [T], idxs: Simd) { self.scatter_select(slice, Mask::splat(true), idxs) } - /// Writes the values in a SIMD vector to multiple potentially discontiguous indices in `slice`. - /// The mask `enable`s all `true` lanes and disables all `false` lanes. - /// If an enabled index is out-of-bounds, the lane is not written. - /// If two enabled lanes in the scattered vector would write to the same index, - /// only the last lane is guaranteed to actually be written. + /// Writes values from a SIMD vector to multiple potentially discontiguous indices in `slice`. + /// The mask `enable`s all `true` indices and disables all `false` indices. + /// If an enabled index is out-of-bounds, the write is suppressed without panicking. + /// If two enabled elements in the scattered vector would write to the same index, + /// only the last element is guaranteed to actually be written. /// /// # Examples /// ``` @@ -391,29 +579,24 @@ where /// # #[cfg(not(feature = "as_crate"))] use core::simd; /// # use simd::{Simd, Mask}; /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; - /// let idxs = Simd::from_array([9, 3, 0, 0]); + /// let idxs = Simd::from_array([9, 3, 0, 0]); // Includes an out-of-bounds index /// let vals = Simd::from_array([-27, 82, -41, 124]); - /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane. + /// let enable = Mask::from_array([true, true, true, false]); // Includes a masked element /// - /// vals.scatter_select(&mut vec, enable, idxs); // index 0's second write is masked, thus omitted. + /// vals.scatter_select(&mut vec, enable, idxs); // The last write is masked, thus omitted. 
/// assert_eq!(vec, vec![-41, 11, 12, 82, 14, 15, 16, 17, 18]); /// ``` #[inline] - pub fn scatter_select( - self, - slice: &mut [T], - enable: Mask, - idxs: Simd, - ) { - let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); - // Safety: We have masked-off out-of-bounds lanes. + pub fn scatter_select(self, slice: &mut [T], enable: Mask, idxs: Simd) { + let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); + // Safety: We have masked-off out-of-bounds indices. unsafe { self.scatter_select_unchecked(slice, enable, idxs) } } - /// Writes the values in a SIMD vector to multiple potentially discontiguous indices in `slice`. - /// The mask `enable`s all `true` lanes and disables all `false` lanes. - /// If two enabled lanes in the scattered vector would write to the same index, - /// only the last lane is guaranteed to actually be written. + /// Writes values from a SIMD vector to multiple potentially discontiguous indices in `slice`. + /// The mask `enable`s all `true` indices and disables all `false` indices. + /// If two enabled elements in the scattered vector would write to the same index, + /// only the last element is guaranteed to actually be written. /// /// # Safety /// @@ -429,22 +612,23 @@ where /// let mut vec: Vec = vec![10, 11, 12, 13, 14, 15, 16, 17, 18]; /// let idxs = Simd::from_array([9, 3, 0, 0]); /// let vals = Simd::from_array([-27, 82, -41, 124]); - /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane. + /// let enable = Mask::from_array([true, true, true, false]); // Masks the final index /// // If this mask was used to scatter, it would be unsound. Let's fix that. /// let enable = enable & idxs.simd_lt(Simd::splat(vec.len())); /// - /// // We have masked the OOB lane, so it's safe to scatter now. + /// // We have masked the OOB index, so it's safe to scatter now. 
/// unsafe { vals.scatter_select_unchecked(&mut vec, enable, idxs); } - /// // index 0's second write is masked, thus was omitted. + /// // The second write to index 0 was masked, thus omitted. /// assert_eq!(vec, vec![-41, 11, 12, 82, 14, 15, 16, 17, 18]); /// ``` /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub unsafe fn scatter_select_unchecked( self, slice: &mut [T], - enable: Mask, - idxs: Simd, + enable: Mask, + idxs: Simd, ) { // Safety: This block works with *mut T derived from &mut 'a [T], // which means it is delicate in Rust's borrowing model, circa 2021: @@ -458,36 +642,89 @@ where // 3. &mut [T] which will become our base ptr. unsafe { // Now Entering ☢️ *mut T Zone - let base_ptr = crate::simd::ptr::SimdMutPtr::splat(slice.as_mut_ptr()); + let base_ptr = Simd::<*mut T, N>::splat(slice.as_mut_ptr()); // Ferris forgive me, I have done pointer arithmetic here. let ptrs = base_ptr.wrapping_add(idxs); // The ptrs have been bounds-masked to prevent memory-unsafe writes insha'allah - intrinsics::simd_scatter(self, ptrs, enable.to_int()) + self.scatter_select_ptr(ptrs, enable); // Cleared ☢️ *mut T Zone } } + + /// Write pointers elementwise into a SIMD vector. + /// + /// # Safety + /// + /// Each write must satisfy the same conditions as [`core::ptr::write`]. 
+ /// + /// # Example + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Simd, SimdMutPtr}; + /// let mut values = [0; 4]; + /// let offset = Simd::from_array([3, 2, 1, 0]); + /// let ptrs = Simd::splat(values.as_mut_ptr()).wrapping_add(offset); + /// unsafe { Simd::from_array([6, 3, 5, 7]).scatter_ptr(ptrs); } + /// assert_eq!(values, [7, 5, 3, 6]); + /// ``` + #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces + pub unsafe fn scatter_ptr(self, dest: Simd<*mut T, N>) { + // Safety: The caller is responsible for upholding all invariants + unsafe { self.scatter_select_ptr(dest, Mask::splat(true)) } + } + + /// Conditionally write pointers elementwise into a SIMD vector. + /// The mask `enable`s all `true` pointers and disables all `false` pointers. + /// If a pointer is disabled, the write to its pointee is skipped. + /// + /// # Safety + /// + /// Enabled pointers must satisfy the same conditions as [`core::ptr::write`]. 
+ /// + /// # Example + /// ``` + /// # #![feature(portable_simd)] + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{Mask, Simd, SimdMutPtr}; + /// let mut values = [0; 4]; + /// let offset = Simd::from_array([3, 2, 1, 0]); + /// let ptrs = Simd::splat(values.as_mut_ptr()).wrapping_add(offset); + /// let enable = Mask::from_array([true, true, false, false]); + /// unsafe { Simd::from_array([6, 3, 5, 7]).scatter_select_ptr(ptrs, enable); } + /// assert_eq!(values, [0, 0, 3, 6]); + /// ``` + #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces + pub unsafe fn scatter_select_ptr(self, dest: Simd<*mut T, N>, enable: Mask) { + // Safety: The caller is responsible for upholding all invariants + unsafe { intrinsics::simd_scatter(self, dest, enable.to_int()) } + } } -impl Copy for Simd +impl Copy for Simd where + LaneCount: SupportedLaneCount, T: SimdElement, - LaneCount: SupportedLaneCount, { } -impl Clone for Simd +impl Clone for Simd where + LaneCount: SupportedLaneCount, T: SimdElement, - LaneCount: SupportedLaneCount, { fn clone(&self) -> Self { *self } } -impl Default for Simd +impl Default for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement + Default, { #[inline] @@ -496,20 +733,20 @@ where } } -impl PartialEq for Simd +impl PartialEq for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement + PartialEq, { #[inline] fn eq(&self, other: &Self) -> bool { // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask. let mask = unsafe { - let tfvec: Simd<::Mask, LANES> = intrinsics::simd_eq(*self, *other); + let tfvec: Simd<::Mask, N> = intrinsics::simd_eq(*self, *other); Mask::from_int_unchecked(tfvec) }; - // Two vectors are equal if all lanes tested true for vertical equality. 
+ // Two vectors are equal if all elements are equal when compared elementwise mask.all() } @@ -518,18 +755,18 @@ where fn ne(&self, other: &Self) -> bool { // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask. let mask = unsafe { - let tfvec: Simd<::Mask, LANES> = intrinsics::simd_ne(*self, *other); + let tfvec: Simd<::Mask, N> = intrinsics::simd_ne(*self, *other); Mask::from_int_unchecked(tfvec) }; - // Two vectors are non-equal if any lane tested true for vertical non-equality. + // Two vectors are non-equal if any elements are non-equal when compared elementwise mask.any() } } -impl PartialOrd for Simd +impl PartialOrd for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement + PartialOrd, { #[inline] @@ -539,16 +776,16 @@ where } } -impl Eq for Simd +impl Eq for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement + Eq, { } -impl Ord for Simd +impl Ord for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement + Ord, { #[inline] @@ -558,9 +795,9 @@ where } } -impl core::hash::Hash for Simd +impl core::hash::Hash for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement + core::hash::Hash, { #[inline] @@ -573,72 +810,96 @@ where } // array references -impl AsRef<[T; LANES]> for Simd +impl AsRef<[T; N]> for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { #[inline] - fn as_ref(&self) -> &[T; LANES] { - &self.0 + fn as_ref(&self) -> &[T; N] { + self.as_array() } } -impl AsMut<[T; LANES]> for Simd +impl AsMut<[T; N]> for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { #[inline] - fn as_mut(&mut self) -> &mut [T; LANES] { - &mut self.0 + fn as_mut(&mut self) -> &mut [T; N] { + self.as_mut_array() } } // slice references -impl AsRef<[T]> for Simd +impl AsRef<[T]> for Simd where - LaneCount: 
SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { #[inline] fn as_ref(&self) -> &[T] { - &self.0 + self.as_array() } } -impl AsMut<[T]> for Simd +impl AsMut<[T]> for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { #[inline] fn as_mut(&mut self) -> &mut [T] { - &mut self.0 + self.as_mut_array() } } // vector/array conversion -impl From<[T; LANES]> for Simd +impl From<[T; N]> for Simd where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { - fn from(array: [T; LANES]) -> Self { - Self(array) + fn from(array: [T; N]) -> Self { + Self::from_array(array) } } -impl From> for [T; LANES] +impl From> for [T; N] where - LaneCount: SupportedLaneCount, + LaneCount: SupportedLaneCount, T: SimdElement, { - fn from(vector: Simd) -> Self { + fn from(vector: Simd) -> Self { vector.to_array() } } +impl TryFrom<&[T]> for Simd +where + LaneCount: SupportedLaneCount, + T: SimdElement, +{ + type Error = core::array::TryFromSliceError; + + fn try_from(slice: &[T]) -> Result { + Ok(Self::from_array(slice.try_into()?)) + } +} + +impl TryFrom<&mut [T]> for Simd +where + LaneCount: SupportedLaneCount, + T: SimdElement, +{ + type Error = core::array::TryFromSliceError; + + fn try_from(slice: &mut [T]) -> Result { + Ok(Self::from_array(slice.try_into()?)) + } +} + mod sealed { pub trait Sealed {} } @@ -740,3 +1001,27 @@ impl Sealed for f64 {} unsafe impl SimdElement for f64 { type Mask = i64; } + +impl Sealed for *const T {} + +// Safety: (thin) const pointers are valid SIMD element types, and are supported by this API +// +// Fat pointers may be supported in the future. +unsafe impl SimdElement for *const T +where + T: core::ptr::Pointee, +{ + type Mask = isize; +} + +impl Sealed for *mut T {} + +// Safety: (thin) mut pointers are valid SIMD element types, and are supported by this API +// +// Fat pointers may be supported in the future. 
+unsafe impl SimdElement for *mut T +where + T: core::ptr::Pointee, +{ + type Mask = isize; +} diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs deleted file mode 100644 index f836c99b1e2..00000000000 --- a/crates/core_simd/src/vector/float.rs +++ /dev/null @@ -1,24 +0,0 @@ -#![allow(non_camel_case_types)] - -use crate::simd::Simd; - -/// A 64-bit SIMD vector with two elements of type `f32`. -pub type f32x2 = Simd; - -/// A 128-bit SIMD vector with four elements of type `f32`. -pub type f32x4 = Simd; - -/// A 256-bit SIMD vector with eight elements of type `f32`. -pub type f32x8 = Simd; - -/// A 512-bit SIMD vector with 16 elements of type `f32`. -pub type f32x16 = Simd; - -/// A 128-bit SIMD vector with two elements of type `f64`. -pub type f64x2 = Simd; - -/// A 256-bit SIMD vector with four elements of type `f64`. -pub type f64x4 = Simd; - -/// A 512-bit SIMD vector with eight elements of type `f64`. -pub type f64x8 = Simd; diff --git a/crates/core_simd/src/vector/int.rs b/crates/core_simd/src/vector/int.rs deleted file mode 100644 index 20e56c7dc64..00000000000 --- a/crates/core_simd/src/vector/int.rs +++ /dev/null @@ -1,63 +0,0 @@ -#![allow(non_camel_case_types)] - -use crate::simd::Simd; - -/// A SIMD vector with two elements of type `isize`. -pub type isizex2 = Simd; - -/// A SIMD vector with four elements of type `isize`. -pub type isizex4 = Simd; - -/// A SIMD vector with eight elements of type `isize`. -pub type isizex8 = Simd; - -/// A 32-bit SIMD vector with two elements of type `i16`. -pub type i16x2 = Simd; - -/// A 64-bit SIMD vector with four elements of type `i16`. -pub type i16x4 = Simd; - -/// A 128-bit SIMD vector with eight elements of type `i16`. -pub type i16x8 = Simd; - -/// A 256-bit SIMD vector with 16 elements of type `i16`. -pub type i16x16 = Simd; - -/// A 512-bit SIMD vector with 32 elements of type `i16`. -pub type i16x32 = Simd; - -/// A 64-bit SIMD vector with two elements of type `i32`. 
-pub type i32x2 = Simd; - -/// A 128-bit SIMD vector with four elements of type `i32`. -pub type i32x4 = Simd; - -/// A 256-bit SIMD vector with eight elements of type `i32`. -pub type i32x8 = Simd; - -/// A 512-bit SIMD vector with 16 elements of type `i32`. -pub type i32x16 = Simd; - -/// A 128-bit SIMD vector with two elements of type `i64`. -pub type i64x2 = Simd; - -/// A 256-bit SIMD vector with four elements of type `i64`. -pub type i64x4 = Simd; - -/// A 512-bit SIMD vector with eight elements of type `i64`. -pub type i64x8 = Simd; - -/// A 32-bit SIMD vector with four elements of type `i8`. -pub type i8x4 = Simd; - -/// A 64-bit SIMD vector with eight elements of type `i8`. -pub type i8x8 = Simd; - -/// A 128-bit SIMD vector with 16 elements of type `i8`. -pub type i8x16 = Simd; - -/// A 256-bit SIMD vector with 32 elements of type `i8`. -pub type i8x32 = Simd; - -/// A 512-bit SIMD vector with 64 elements of type `i8`. -pub type i8x64 = Simd; diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs deleted file mode 100644 index fa756344db9..00000000000 --- a/crates/core_simd/src/vector/ptr.rs +++ /dev/null @@ -1,51 +0,0 @@ -//! Private implementation details of public gather/scatter APIs. -use crate::simd::intrinsics; -use crate::simd::{LaneCount, Simd, SupportedLaneCount}; - -/// A vector of *const T. -#[derive(Debug, Copy, Clone)] -#[repr(simd)] -pub(crate) struct SimdConstPtr([*const T; LANES]); - -impl SimdConstPtr -where - LaneCount: SupportedLaneCount, - T: Sized, -{ - #[inline] - #[must_use] - pub fn splat(ptr: *const T) -> Self { - Self([ptr; LANES]) - } - - #[inline] - #[must_use] - pub fn wrapping_add(self, addend: Simd) -> Self { - // Safety: this intrinsic doesn't have a precondition - unsafe { intrinsics::simd_arith_offset(self, addend) } - } -} - -/// A vector of *mut T. Be very careful around potential aliasing. 
-#[derive(Debug, Copy, Clone)] -#[repr(simd)] -pub(crate) struct SimdMutPtr([*mut T; LANES]); - -impl SimdMutPtr -where - LaneCount: SupportedLaneCount, - T: Sized, -{ - #[inline] - #[must_use] - pub fn splat(ptr: *mut T) -> Self { - Self([ptr; LANES]) - } - - #[inline] - #[must_use] - pub fn wrapping_add(self, addend: Simd) -> Self { - // Safety: this intrinsic doesn't have a precondition - unsafe { intrinsics::simd_arith_offset(self, addend) } - } -} diff --git a/crates/core_simd/src/vector/uint.rs b/crates/core_simd/src/vector/uint.rs deleted file mode 100644 index b4a69c44363..00000000000 --- a/crates/core_simd/src/vector/uint.rs +++ /dev/null @@ -1,63 +0,0 @@ -#![allow(non_camel_case_types)] - -use crate::simd::Simd; - -/// A SIMD vector with two elements of type `usize`. -pub type usizex2 = Simd; - -/// A SIMD vector with four elements of type `usize`. -pub type usizex4 = Simd; - -/// A SIMD vector with eight elements of type `usize`. -pub type usizex8 = Simd; - -/// A 32-bit SIMD vector with two elements of type `u16`. -pub type u16x2 = Simd; - -/// A 64-bit SIMD vector with four elements of type `u16`. -pub type u16x4 = Simd; - -/// A 128-bit SIMD vector with eight elements of type `u16`. -pub type u16x8 = Simd; - -/// A 256-bit SIMD vector with 16 elements of type `u16`. -pub type u16x16 = Simd; - -/// A 512-bit SIMD vector with 32 elements of type `u16`. -pub type u16x32 = Simd; - -/// A 64-bit SIMD vector with two elements of type `u32`. -pub type u32x2 = Simd; - -/// A 128-bit SIMD vector with four elements of type `u32`. -pub type u32x4 = Simd; - -/// A 256-bit SIMD vector with eight elements of type `u32`. -pub type u32x8 = Simd; - -/// A 512-bit SIMD vector with 16 elements of type `u32`. -pub type u32x16 = Simd; - -/// A 128-bit SIMD vector with two elements of type `u64`. -pub type u64x2 = Simd; - -/// A 256-bit SIMD vector with four elements of type `u64`. -pub type u64x4 = Simd; - -/// A 512-bit SIMD vector with eight elements of type `u64`. 
-pub type u64x8 = Simd; - -/// A 32-bit SIMD vector with four elements of type `u8`. -pub type u8x4 = Simd; - -/// A 64-bit SIMD vector with eight elements of type `u8`. -pub type u8x8 = Simd; - -/// A 128-bit SIMD vector with 16 elements of type `u8`. -pub type u8x16 = Simd; - -/// A 256-bit SIMD vector with 32 elements of type `u8`. -pub type u8x32 = Simd; - -/// A 512-bit SIMD vector with 64 elements of type `u8`. -pub type u8x64 = Simd; diff --git a/crates/core_simd/tests/autoderef.rs b/crates/core_simd/tests/autoderef.rs index 9359da16ee5..3181826ef59 100644 --- a/crates/core_simd/tests/autoderef.rs +++ b/crates/core_simd/tests/autoderef.rs @@ -1,6 +1,6 @@ // Test that we handle all our "auto-deref" cases correctly. #![feature(portable_simd)] -use core_simd::f32x4; +use core_simd::simd::f32x4; #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::*; diff --git a/crates/core_simd/tests/mask_ops_impl/mask_macros.rs b/crates/core_simd/tests/mask_ops_impl/mask_macros.rs index 795f9e27c44..faafa5fa51f 100644 --- a/crates/core_simd/tests/mask_ops_impl/mask_macros.rs +++ b/crates/core_simd/tests/mask_ops_impl/mask_macros.rs @@ -2,7 +2,7 @@ macro_rules! mask_tests { { $vector:ident, $lanes:literal } => { #[cfg(test)] mod $vector { - use core_simd::$vector as Vector; + use core_simd::simd::$vector as Vector; const LANES: usize = $lanes; #[cfg(target_arch = "wasm32")] diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs index 673d0db93fe..9f8bad1c36c 100644 --- a/crates/core_simd/tests/masks.rs +++ b/crates/core_simd/tests/masks.rs @@ -13,11 +13,13 @@ macro_rules! 
test_mask_api { #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::*; + use core_simd::simd::Mask; + #[test] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn set_and_test() { let values = [true, false, false, true, false, false, true, false]; - let mut mask = core_simd::Mask::<$type, 8>::splat(false); + let mut mask = Mask::<$type, 8>::splat(false); for (lane, value) in values.iter().copied().enumerate() { mask.set(lane, value); } @@ -29,7 +31,7 @@ macro_rules! test_mask_api { #[test] #[should_panic] fn set_invalid_lane() { - let mut mask = core_simd::Mask::<$type, 8>::splat(false); + let mut mask = Mask::<$type, 8>::splat(false); mask.set(8, true); let _ = mask; } @@ -37,24 +39,24 @@ macro_rules! test_mask_api { #[test] #[should_panic] fn test_invalid_lane() { - let mask = core_simd::Mask::<$type, 8>::splat(false); + let mask = Mask::<$type, 8>::splat(false); let _ = mask.test(8); } #[test] fn any() { - assert!(!core_simd::Mask::<$type, 8>::splat(false).any()); - assert!(core_simd::Mask::<$type, 8>::splat(true).any()); - let mut v = core_simd::Mask::<$type, 8>::splat(false); + assert!(!Mask::<$type, 8>::splat(false).any()); + assert!(Mask::<$type, 8>::splat(true).any()); + let mut v = Mask::<$type, 8>::splat(false); v.set(2, true); assert!(v.any()); } #[test] fn all() { - assert!(!core_simd::Mask::<$type, 8>::splat(false).all()); - assert!(core_simd::Mask::<$type, 8>::splat(true).all()); - let mut v = core_simd::Mask::<$type, 8>::splat(false); + assert!(!Mask::<$type, 8>::splat(false).all()); + assert!(Mask::<$type, 8>::splat(true).all()); + let mut v = Mask::<$type, 8>::splat(false); v.set(2, true); assert!(!v.all()); } @@ -62,57 +64,57 @@ macro_rules! 
test_mask_api { #[test] fn roundtrip_int_conversion() { let values = [true, false, false, true, false, false, true, false]; - let mask = core_simd::Mask::<$type, 8>::from_array(values); + let mask = Mask::<$type, 8>::from_array(values); let int = mask.to_int(); assert_eq!(int.to_array(), [-1, 0, 0, -1, 0, 0, -1, 0]); - assert_eq!(core_simd::Mask::<$type, 8>::from_int(int), mask); + assert_eq!(Mask::<$type, 8>::from_int(int), mask); } #[test] fn roundtrip_bitmask_conversion() { - use core_simd::ToBitMask; + use core_simd::simd::ToBitMask; let values = [ true, false, false, true, false, false, true, false, true, true, false, false, false, false, false, true, ]; - let mask = core_simd::Mask::<$type, 16>::from_array(values); + let mask = Mask::<$type, 16>::from_array(values); let bitmask = mask.to_bitmask(); assert_eq!(bitmask, 0b1000001101001001); - assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask); + assert_eq!(Mask::<$type, 16>::from_bitmask(bitmask), mask); } #[test] fn roundtrip_bitmask_conversion_short() { - use core_simd::ToBitMask; + use core_simd::simd::ToBitMask; let values = [ false, false, false, true, ]; - let mask = core_simd::Mask::<$type, 4>::from_array(values); + let mask = Mask::<$type, 4>::from_array(values); let bitmask = mask.to_bitmask(); assert_eq!(bitmask, 0b1000); - assert_eq!(core_simd::Mask::<$type, 4>::from_bitmask(bitmask), mask); + assert_eq!(Mask::<$type, 4>::from_bitmask(bitmask), mask); let values = [true, false]; - let mask = core_simd::Mask::<$type, 2>::from_array(values); + let mask = Mask::<$type, 2>::from_array(values); let bitmask = mask.to_bitmask(); assert_eq!(bitmask, 0b01); - assert_eq!(core_simd::Mask::<$type, 2>::from_bitmask(bitmask), mask); + assert_eq!(Mask::<$type, 2>::from_bitmask(bitmask), mask); } #[test] fn cast() { - fn cast_impl() + fn cast_impl() where - core_simd::Mask<$type, 8>: Into>, + Mask<$type, 8>: Into>, { let values = [true, false, false, true, false, false, true, false]; - let mask = 
core_simd::Mask::<$type, 8>::from_array(values); + let mask = Mask::<$type, 8>::from_array(values); let cast_mask = mask.cast::(); assert_eq!(values, cast_mask.to_array()); - let into_mask: core_simd::Mask = mask.into(); + let into_mask: Mask = mask.into(); assert_eq!(values, into_mask.to_array()); } @@ -126,15 +128,15 @@ macro_rules! test_mask_api { #[cfg(feature = "generic_const_exprs")] #[test] fn roundtrip_bitmask_array_conversion() { - use core_simd::ToBitMaskArray; + use core_simd::simd::ToBitMaskArray; let values = [ true, false, false, true, false, false, true, false, true, true, false, false, false, false, false, true, ]; - let mask = core_simd::Mask::<$type, 16>::from_array(values); + let mask = Mask::<$type, 16>::from_array(values); let bitmask = mask.to_bitmask_array(); assert_eq!(bitmask, [0b01001001, 0b10000011]); - assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask_array(bitmask), mask); + assert_eq!(Mask::<$type, 16>::from_bitmask_array(bitmask), mask); } } } @@ -150,9 +152,10 @@ mod mask_api { #[test] fn convert() { + use core_simd::simd::Mask; let values = [true, false, false, true, false, false, true, false]; assert_eq!( - core_simd::Mask::::from_array(values), - core_simd::Mask::::from_array(values).into() + Mask::::from_array(values), + Mask::::from_array(values).into() ); } diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs index f759394d075..3a02f3f01e1 100644 --- a/crates/core_simd/tests/ops_macros.rs +++ b/crates/core_simd/tests/ops_macros.rs @@ -7,7 +7,7 @@ macro_rules! impl_unary_op_test { test_helpers::test_lanes! { fn $fn() { test_helpers::test_unary_elementwise( - & as core::ops::$trait>::$fn, + & as core::ops::$trait>::$fn, &$scalar_fn, &|_| true, ); @@ -27,7 +27,7 @@ macro_rules! 
impl_binary_op_test { { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $scalar_fn:expr } => { mod $fn { use super::*; - use core_simd::Simd; + use core_simd::simd::Simd; test_helpers::test_lanes! { fn normal() { @@ -64,7 +64,7 @@ macro_rules! impl_binary_checked_op_test { { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $scalar_fn:expr, $check_fn:expr } => { mod $fn { use super::*; - use core_simd::Simd; + use core_simd::simd::Simd; test_helpers::test_lanes! { fn normal() { @@ -173,7 +173,7 @@ macro_rules! impl_signed_tests { { $scalar:tt } => { mod $scalar { use core_simd::simd::SimdInt; - type Vector = core_simd::Simd; + type Vector = core_simd::simd::Simd; type Scalar = $scalar; impl_common_integer_tests! { Vector, Scalar } @@ -314,7 +314,7 @@ macro_rules! impl_unsigned_tests { { $scalar:tt } => { mod $scalar { use core_simd::simd::SimdUint; - type Vector = core_simd::Simd; + type Vector = core_simd::simd::Simd; type Scalar = $scalar; impl_common_integer_tests! { Vector, Scalar } @@ -348,8 +348,8 @@ macro_rules! impl_unsigned_tests { macro_rules! impl_float_tests { { $scalar:tt, $int_scalar:tt } => { mod $scalar { - use core_simd::SimdFloat; - type Vector = core_simd::Simd; + use core_simd::simd::SimdFloat; + type Vector = core_simd::simd::Simd; type Scalar = $scalar; impl_unary_op_test!(Scalar, Neg::neg); diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs new file mode 100644 index 00000000000..0ae8f83b8b9 --- /dev/null +++ b/crates/core_simd/tests/pointers.rs @@ -0,0 +1,111 @@ +#![feature(portable_simd, strict_provenance)] + +use core_simd::simd::{Simd, SimdConstPtr, SimdMutPtr}; + +macro_rules! common_tests { + { $constness:ident } => { + test_helpers::test_lanes! 
{ + fn is_null() { + test_helpers::test_unary_mask_elementwise( + &Simd::<*$constness u32, LANES>::is_null, + &<*$constness u32>::is_null, + &|_| true, + ); + } + + fn addr() { + test_helpers::test_unary_elementwise( + &Simd::<*$constness u32, LANES>::addr, + &<*$constness u32>::addr, + &|_| true, + ); + } + + fn with_addr() { + test_helpers::test_binary_elementwise( + &Simd::<*$constness u32, LANES>::with_addr, + &<*$constness u32>::with_addr, + &|_, _| true, + ); + } + + fn expose_addr() { + test_helpers::test_unary_elementwise( + &Simd::<*$constness u32, LANES>::expose_addr, + &<*$constness u32>::expose_addr, + &|_| true, + ); + } + + fn wrapping_offset() { + test_helpers::test_binary_elementwise( + &Simd::<*$constness u32, LANES>::wrapping_offset, + &<*$constness u32>::wrapping_offset, + &|_, _| true, + ); + } + + fn wrapping_add() { + test_helpers::test_binary_elementwise( + &Simd::<*$constness u32, LANES>::wrapping_add, + &<*$constness u32>::wrapping_add, + &|_, _| true, + ); + } + + fn wrapping_sub() { + test_helpers::test_binary_elementwise( + &Simd::<*$constness u32, LANES>::wrapping_sub, + &<*$constness u32>::wrapping_sub, + &|_, _| true, + ); + } + } + } +} + +mod const_ptr { + use super::*; + common_tests! { const } + + test_helpers::test_lanes! { + fn cast_mut() { + test_helpers::test_unary_elementwise( + &Simd::<*const u32, LANES>::cast_mut, + &<*const u32>::cast_mut, + &|_| true, + ); + } + + fn from_exposed_addr() { + test_helpers::test_unary_elementwise( + &Simd::<*const u32, LANES>::from_exposed_addr, + &core::ptr::from_exposed_addr::, + &|_| true, + ); + } + } +} + +mod mut_ptr { + use super::*; + common_tests! { mut } + + test_helpers::test_lanes! 
{ + fn cast_const() { + test_helpers::test_unary_elementwise( + &Simd::<*mut u32, LANES>::cast_const, + &<*mut u32>::cast_const, + &|_| true, + ); + } + + fn from_exposed_addr() { + test_helpers::test_unary_elementwise( + &Simd::<*mut u32, LANES>::from_exposed_addr, + &core::ptr::from_exposed_addr_mut::, + &|_| true, + ); + } + } +} diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs index 484fd5bf47d..8b9638ad466 100644 --- a/crates/core_simd/tests/round.rs +++ b/crates/core_simd/tests/round.rs @@ -5,7 +5,7 @@ macro_rules! float_rounding_test { mod $scalar { use std_float::StdFloat; - type Vector = core_simd::Simd<$scalar, LANES>; + type Vector = core_simd::simd::Simd<$scalar, LANES>; type Scalar = $scalar; type IntScalar = $int_scalar; diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs index 51c63611aba..8cd7c33e823 100644 --- a/crates/core_simd/tests/swizzle.rs +++ b/crates/core_simd/tests/swizzle.rs @@ -1,5 +1,5 @@ #![feature(portable_simd)] -use core_simd::{Simd, Swizzle}; +use core_simd::simd::{Simd, Swizzle}; #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::*; @@ -60,3 +60,17 @@ fn interleave() { assert_eq!(even, a); assert_eq!(odd, b); } + +// portable-simd#298 +#[test] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn interleave_one() { + let a = Simd::from_array([0]); + let b = Simd::from_array([1]); + let (lo, hi) = a.interleave(b); + assert_eq!(lo.to_array(), [0]); + assert_eq!(hi.to_array(), [1]); + let (even, odd) = lo.deinterleave(hi); + assert_eq!(even, a); + assert_eq!(odd, b); +} diff --git a/crates/core_simd/tests/swizzle_dyn.rs b/crates/core_simd/tests/swizzle_dyn.rs new file mode 100644 index 00000000000..646cd5f3383 --- /dev/null +++ b/crates/core_simd/tests/swizzle_dyn.rs @@ -0,0 +1,74 @@ +#![feature(portable_simd)] +use core::{fmt, ops::RangeInclusive}; +use proptest; +use test_helpers::{self, biteq, make_runner, prop_assert_biteq}; + +fn swizzle_dyn_scalar_ver(values: 
[u8; N], idxs: [u8; N]) -> [u8; N] { + let mut array = [0; N]; + for (i, k) in idxs.into_iter().enumerate() { + if (k as usize) < N { + array[i] = values[k as usize]; + }; + } + array +} + +test_helpers::test_lanes! { + fn swizzle_dyn() { + match_simd_with_fallback( + &core_simd::simd::Simd::::swizzle_dyn, + &swizzle_dyn_scalar_ver, + &|_, _| true, + ); + } +} + +fn match_simd_with_fallback( + fv: &dyn Fn(Vector, Vector) -> VectorResult, + fs: &dyn Fn([Scalar; N], [Scalar; N]) -> [ScalarResult; N], + check: &dyn Fn([Scalar; N], [Scalar; N]) -> bool, +) where + Scalar: Copy + fmt::Debug + SwizzleStrategy, + ScalarResult: Copy + biteq::BitEq + fmt::Debug + SwizzleStrategy, + Vector: Into<[Scalar; N]> + From<[Scalar; N]> + Copy, + VectorResult: Into<[ScalarResult; N]> + From<[ScalarResult; N]> + Copy, +{ + test_swizzles_2(&|x: [Scalar; N], y: [Scalar; N]| { + proptest::prop_assume!(check(x, y)); + let result_v: [ScalarResult; N] = fv(x.into(), y.into()).into(); + let result_s: [ScalarResult; N] = fs(x, y); + crate::prop_assert_biteq!(result_v, result_s); + Ok(()) + }); +} + +fn test_swizzles_2( + f: &dyn Fn(A, B) -> proptest::test_runner::TestCaseResult, +) { + let mut runner = make_runner(); + runner + .run( + &(A::swizzled_strategy(), B::swizzled_strategy()), + |(a, b)| f(a, b), + ) + .unwrap(); +} + +pub trait SwizzleStrategy { + type Strategy: proptest::strategy::Strategy; + fn swizzled_strategy() -> Self::Strategy; +} + +impl SwizzleStrategy for u8 { + type Strategy = RangeInclusive; + fn swizzled_strategy() -> Self::Strategy { + 0..=64 + } +} + +impl SwizzleStrategy for [T; N] { + type Strategy = test_helpers::array::UniformArrayStrategy; + fn swizzled_strategy() -> Self::Strategy { + Self::Strategy::new(T::swizzled_strategy()) + } +} diff --git a/crates/core_simd/tests/to_bytes.rs b/crates/core_simd/tests/to_bytes.rs index debb4335e2c..be0ee4349c5 100644 --- a/crates/core_simd/tests/to_bytes.rs +++ b/crates/core_simd/tests/to_bytes.rs @@ -2,7 +2,7 @@ 
#![allow(incomplete_features)] #![cfg(feature = "generic_const_exprs")] -use core_simd::Simd; +use core_simd::simd::Simd; #[test] fn byte_convert() { diff --git a/crates/core_simd/tests/try_from_slice.rs b/crates/core_simd/tests/try_from_slice.rs new file mode 100644 index 00000000000..859e3b94f2c --- /dev/null +++ b/crates/core_simd/tests/try_from_slice.rs @@ -0,0 +1,25 @@ +#![feature(portable_simd)] + +#[cfg(target_arch = "wasm32")] +use wasm_bindgen_test::*; + +#[cfg(target_arch = "wasm32")] +wasm_bindgen_test_configure!(run_in_browser); + +use core_simd::simd::i32x4; + +#[test] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn try_from_slice() { + // Equal length + assert_eq!( + i32x4::try_from([1, 2, 3, 4].as_slice()).unwrap(), + i32x4::from_array([1, 2, 3, 4]) + ); + + // Slice length > vector length + assert!(i32x4::try_from([1, 2, 3, 4, 5].as_slice()).is_err()); + + // Slice length < vector length + assert!(i32x4::try_from([1, 2, 3].as_slice()).is_err()); +} diff --git a/crates/test_helpers/Cargo.toml b/crates/test_helpers/Cargo.toml index a04b0961d7f..1d2bc8b519a 100644 --- a/crates/test_helpers/Cargo.toml +++ b/crates/test_helpers/Cargo.toml @@ -8,3 +8,6 @@ publish = false version = "0.10" default-features = false features = ["alloc"] + +[features] +all_lane_counts = [] diff --git a/crates/test_helpers/src/array.rs b/crates/test_helpers/src/array.rs index 5ffc9226976..984a427320d 100644 --- a/crates/test_helpers/src/array.rs +++ b/crates/test_helpers/src/array.rs @@ -41,6 +41,7 @@ where fn new_tree(&self, runner: &mut TestRunner) -> NewTree { let tree: [S::Tree; LANES] = unsafe { + #[allow(clippy::uninit_assumed_init)] let mut tree: [MaybeUninit; LANES] = MaybeUninit::uninit().assume_init(); for t in tree.iter_mut() { *t = MaybeUninit::new(self.strategy.new_tree(runner)?) 
@@ -60,6 +61,7 @@ impl ValueTree for ArrayValueTree<[T; LANES]> fn current(&self) -> Self::Value { unsafe { + #[allow(clippy::uninit_assumed_init)] let mut value: [MaybeUninit; LANES] = MaybeUninit::uninit().assume_init(); for (tree_elem, value_elem) in self.tree.iter().zip(value.iter_mut()) { *value_elem = MaybeUninit::new(tree_elem.current()); diff --git a/crates/test_helpers/src/biteq.rs b/crates/test_helpers/src/biteq.rs index 00350e22418..7d91260d838 100644 --- a/crates/test_helpers/src/biteq.rs +++ b/crates/test_helpers/src/biteq.rs @@ -55,6 +55,26 @@ macro_rules! impl_float_biteq { impl_float_biteq! { f32, f64 } +impl BitEq for *const T { + fn biteq(&self, other: &Self) -> bool { + self == other + } + + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self) + } +} + +impl BitEq for *mut T { + fn biteq(&self, other: &Self) -> bool { + self == other + } + + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self) + } +} + impl BitEq for [T; N] { fn biteq(&self, other: &Self) -> bool { self.iter() diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs index 141bee18a9a..b26cdc311a2 100644 --- a/crates/test_helpers/src/lib.rs +++ b/crates/test_helpers/src/lib.rs @@ -38,6 +38,28 @@ impl_num! { usize } impl_num! { f32 } impl_num! 
{ f64 } +impl DefaultStrategy for *const T { + type Strategy = proptest::strategy::Map *const T>; + fn default_strategy() -> Self::Strategy { + fn map(x: isize) -> *const T { + x as _ + } + use proptest::strategy::Strategy; + proptest::num::isize::ANY.prop_map(map) + } +} + +impl DefaultStrategy for *mut T { + type Strategy = proptest::strategy::Map *mut T>; + fn default_strategy() -> Self::Strategy { + fn map(x: isize) -> *mut T { + x as _ + } + use proptest::strategy::Strategy; + proptest::num::isize::ANY.prop_map(map) + } +} + #[cfg(not(target_arch = "wasm32"))] impl DefaultStrategy for u128 { type Strategy = proptest::num::u128::Any; @@ -135,21 +157,21 @@ pub fn test_unary_elementwise ScalarResult, check: &dyn Fn([Scalar; LANES]) -> bool, ) where - Scalar: Copy + Default + core::fmt::Debug + DefaultStrategy, - ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy, + Scalar: Copy + core::fmt::Debug + DefaultStrategy, + ScalarResult: Copy + biteq::BitEq + core::fmt::Debug + DefaultStrategy, Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy, VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy, { test_1(&|x: [Scalar; LANES]| { proptest::prop_assume!(check(x)); let result_1: [ScalarResult; LANES] = fv(x.into()).into(); - let result_2: [ScalarResult; LANES] = { - let mut result = [ScalarResult::default(); LANES]; - for (i, o) in x.iter().zip(result.iter_mut()) { - *o = fs(*i); - } - result - }; + let result_2: [ScalarResult; LANES] = x + .iter() + .copied() + .map(fs) + .collect::>() + .try_into() + .unwrap(); crate::prop_assert_biteq!(result_1, result_2); Ok(()) }); @@ -162,7 +184,7 @@ pub fn test_unary_mask_elementwise( fs: &dyn Fn(Scalar) -> bool, check: &dyn Fn([Scalar; LANES]) -> bool, ) where - Scalar: Copy + Default + core::fmt::Debug + DefaultStrategy, + Scalar: Copy + core::fmt::Debug + DefaultStrategy, Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy, Mask: Into<[bool; LANES]> + 
From<[bool; LANES]> + Copy, { @@ -196,9 +218,9 @@ pub fn test_binary_elementwise< fs: &dyn Fn(Scalar1, Scalar2) -> ScalarResult, check: &dyn Fn([Scalar1; LANES], [Scalar2; LANES]) -> bool, ) where - Scalar1: Copy + Default + core::fmt::Debug + DefaultStrategy, - Scalar2: Copy + Default + core::fmt::Debug + DefaultStrategy, - ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy, + Scalar1: Copy + core::fmt::Debug + DefaultStrategy, + Scalar2: Copy + core::fmt::Debug + DefaultStrategy, + ScalarResult: Copy + biteq::BitEq + core::fmt::Debug + DefaultStrategy, Vector1: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy, Vector2: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy, VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy, @@ -206,13 +228,14 @@ pub fn test_binary_elementwise< test_2(&|x: [Scalar1; LANES], y: [Scalar2; LANES]| { proptest::prop_assume!(check(x, y)); let result_1: [ScalarResult; LANES] = fv(x.into(), y.into()).into(); - let result_2: [ScalarResult; LANES] = { - let mut result = [ScalarResult::default(); LANES]; - for ((i1, i2), o) in x.iter().zip(y.iter()).zip(result.iter_mut()) { - *o = fs(*i1, *i2); - } - result - }; + let result_2: [ScalarResult; LANES] = x + .iter() + .copied() + .zip(y.iter().copied()) + .map(|(x, y)| fs(x, y)) + .collect::>() + .try_into() + .unwrap(); crate::prop_assert_biteq!(result_1, result_2); Ok(()) }); @@ -333,6 +356,39 @@ pub fn test_ternary_elementwise< ); } +#[doc(hidden)] +#[macro_export] +macro_rules! 
test_lanes_helper { + ($($(#[$meta:meta])* $fn_name:ident $lanes:literal;)+) => { + $( + #[test] + $(#[$meta])* + fn $fn_name() { + implementation::<$lanes>(); + } + )+ + }; + ( + $(#[$meta:meta])+; + $($(#[$meta_before:meta])+ $fn_name_before:ident $lanes_before:literal;)* + $fn_name:ident $lanes:literal; + $($fn_name_rest:ident $lanes_rest:literal;)* + ) => { + $crate::test_lanes_helper!( + $(#[$meta])+; + $($(#[$meta_before])+ $fn_name_before $lanes_before;)* + $(#[$meta])+ $fn_name $lanes; + $($fn_name_rest $lanes_rest;)* + ); + }; + ( + $(#[$meta_ignored:meta])+; + $($(#[$meta:meta])+ $fn_name:ident $lanes:literal;)+ + ) => { + $crate::test_lanes_helper!($($(#[$meta])+ $fn_name $lanes;)+); + }; +} + /// Expand a const-generic test into separate tests for each possible lane count. #[macro_export] macro_rules! test_lanes { @@ -345,57 +401,96 @@ macro_rules! test_lanes { fn implementation() where - core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount, + core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount, $body #[cfg(target_arch = "wasm32")] wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser); - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] - fn lanes_1() { - implementation::<1>(); - } + $crate::test_lanes_helper!( + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]; + lanes_1 1; + lanes_2 2; + lanes_4 4; + ); - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] - fn lanes_2() { - implementation::<2>(); - } - - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] - fn lanes_4() { - implementation::<4>(); - } - - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow - fn lanes_8() { - implementation::<8>(); - } + $crate::test_lanes_helper!( + #[cfg_attr(target_arch = "wasm32", 
wasm_bindgen_test::wasm_bindgen_test)]; + lanes_8 8; + lanes_16 16; + lanes_32 32; + lanes_64 64; + ); - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] - #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow - fn lanes_16() { - implementation::<16>(); - } + #[cfg(feature = "all_lane_counts")] + $crate::test_lanes_helper!( + // test some odd and even non-power-of-2 lengths on miri + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]; + lanes_3 3; + lanes_5 5; + lanes_6 6; + ); - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[cfg(feature = "all_lane_counts")] #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow - fn lanes_32() { - implementation::<32>(); - } - - #[test] - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] - #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow - fn lanes_64() { - implementation::<64>(); - } + $crate::test_lanes_helper!( + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]; + lanes_7 7; + lanes_9 9; + lanes_10 10; + lanes_11 11; + lanes_12 12; + lanes_13 13; + lanes_14 14; + lanes_15 15; + lanes_17 17; + lanes_18 18; + lanes_19 19; + lanes_20 20; + lanes_21 21; + lanes_22 22; + lanes_23 23; + lanes_24 24; + lanes_25 25; + lanes_26 26; + lanes_27 27; + lanes_28 28; + lanes_29 29; + lanes_30 30; + lanes_31 31; + lanes_33 33; + lanes_34 34; + lanes_35 35; + lanes_36 36; + lanes_37 37; + lanes_38 38; + lanes_39 39; + lanes_40 40; + lanes_41 41; + lanes_42 42; + lanes_43 43; + lanes_44 44; + lanes_45 45; + lanes_46 46; + lanes_47 47; + lanes_48 48; + lanes_49 49; + lanes_50 50; + lanes_51 51; + lanes_52 52; + lanes_53 53; + lanes_54 54; + lanes_55 55; + lanes_56 56; + lanes_57 57; + lanes_58 58; + lanes_59 59; + lanes_60 60; + lanes_61 61; + lanes_62 62; + lanes_63 63; + ); } )* 
} @@ -413,50 +508,93 @@ macro_rules! test_lanes_panic { fn implementation() where - core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount, + core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount, $body - #[test] - #[should_panic] - fn lanes_1() { - implementation::<1>(); - } + $crate::test_lanes_helper!( + #[should_panic]; + lanes_1 1; + lanes_2 2; + lanes_4 4; + ); - #[test] - #[should_panic] - fn lanes_2() { - implementation::<2>(); - } + #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow + $crate::test_lanes_helper!( + #[should_panic]; + lanes_8 8; + lanes_16 16; + lanes_32 32; + lanes_64 64; + ); - #[test] - #[should_panic] - fn lanes_4() { - implementation::<4>(); - } + #[cfg(feature = "all_lane_counts")] + $crate::test_lanes_helper!( + // test some odd and even non-power-of-2 lengths on miri + #[should_panic]; + lanes_3 3; + lanes_5 5; + lanes_6 6; + ); - #[test] - #[should_panic] - fn lanes_8() { - implementation::<8>(); - } - - #[test] - #[should_panic] - fn lanes_16() { - implementation::<16>(); - } - - #[test] - #[should_panic] - fn lanes_32() { - implementation::<32>(); - } - - #[test] - #[should_panic] - fn lanes_64() { - implementation::<64>(); - } + #[cfg(feature = "all_lane_counts")] + #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow + $crate::test_lanes_helper!( + #[should_panic]; + lanes_7 7; + lanes_9 9; + lanes_10 10; + lanes_11 11; + lanes_12 12; + lanes_13 13; + lanes_14 14; + lanes_15 15; + lanes_17 17; + lanes_18 18; + lanes_19 19; + lanes_20 20; + lanes_21 21; + lanes_22 22; + lanes_23 23; + lanes_24 24; + lanes_25 25; + lanes_26 26; + lanes_27 27; + lanes_28 28; + lanes_29 29; + lanes_30 30; + lanes_31 31; + lanes_33 33; + lanes_34 34; + lanes_35 35; + lanes_36 36; + lanes_37 37; + lanes_38 38; + lanes_39 39; + lanes_40 40; + lanes_41 41; + lanes_42 42; + lanes_43 43; + lanes_44 44; + lanes_45 45; + lanes_46 46; + lanes_47 47; + 
lanes_48 48; + lanes_49 49; + lanes_50 50; + lanes_51 51; + lanes_52 52; + lanes_53 53; + lanes_54 54; + lanes_55 55; + lanes_56 56; + lanes_57 57; + lanes_58 58; + lanes_59 59; + lanes_60 60; + lanes_61 61; + lanes_62 62; + lanes_63 63; + ); } )* } From c7f6aae79c804e75a42e5aa23c7f82b8a9cf787c Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Thu, 11 May 2023 17:22:00 -0700 Subject: [PATCH 20/35] Correct swizzle_dyn cfg for armv7 neon --- crates/core_simd/src/swizzle_dyn.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs index 3eb80d5dca1..6065d645937 100644 --- a/crates/core_simd/src/swizzle_dyn.rs +++ b/crates/core_simd/src/swizzle_dyn.rs @@ -18,7 +18,7 @@ where #![allow(unused_imports, unused_unsafe)] #[cfg(target_arch = "aarch64")] use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8}; - #[cfg(all(target_arch = "arm", target_feature = "v7"))] + #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] use core::arch::arm::{uint8x8_t, vtbl1_u8}; #[cfg(target_arch = "wasm32")] use core::arch::wasm32 as wasm; From b3b5cfca660d8926b5dd6e69c82c263381acab5e Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Tue, 2 May 2023 21:17:22 -0400 Subject: [PATCH 21/35] Add a prelude --- crates/core_simd/src/mod.rs | 2 + crates/core_simd/src/simd/prelude.rs | 79 ++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 crates/core_simd/src/simd/prelude.rs diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs index 35c659b7a42..f9891a3b7c1 100644 --- a/crates/core_simd/src/mod.rs +++ b/crates/core_simd/src/mod.rs @@ -23,6 +23,8 @@ mod vendor; #[doc = include_str!("core_simd_docs.md")] pub mod simd { + pub mod prelude; + pub(crate) use crate::core_simd::intrinsics; pub use crate::core_simd::alias::*; diff --git a/crates/core_simd/src/simd/prelude.rs b/crates/core_simd/src/simd/prelude.rs new file mode 100644 index 
00000000000..7d4f0798afa --- /dev/null +++ b/crates/core_simd/src/simd/prelude.rs @@ -0,0 +1,79 @@ +//! The portable SIMD prelude. +//! +//! Includes important traits and types to be imported with a glob: +//! ``` +//! use std::simd::prelude::*; +//! ``` + +#[doc(no_inline)] +pub use super::{ + SimdConstPtr, SimdFloat, SimdInt, SimdMutPtr, SimdOrd, SimdPartialEq, SimdPartialOrd, SimdUint, +}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{f32x1, f32x2, f32x4, f32x8, f32x16, f32x32, f32x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{f64x1, f64x2, f64x4, f64x8, f64x16, f64x32, f64x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{i32x1, i32x2, i32x4, i32x8, i32x16, i32x32, i32x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{i64x1, i64x2, i64x4, i64x8, i64x16, i64x32, i64x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{isizex1, isizex2, isizex4, isizex8, isizex16, isizex32, isizex64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{u32x1, u32x2, u32x4, u32x8, u32x16, u32x32, u32x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{u64x1, u64x2, u64x4, u64x8, u64x16, u64x32, u64x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{usizex1, usizex2, usizex4, usizex8, usizex16, usizex32, usizex64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{mask8x1, mask8x2, mask8x4, mask8x8, mask8x16, mask8x32, mask8x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{mask16x1, mask16x2, mask16x4, mask16x8, mask16x16, mask16x32, mask16x64}; + +#[rustfmt::skip] 
+#[doc(no_inline)] +pub use super::{mask32x1, mask32x2, mask32x4, mask32x8, mask32x16, mask32x32, mask32x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{mask64x1, mask64x2, mask64x4, mask64x8, mask64x16, mask64x32, mask64x64}; + +#[rustfmt::skip] +#[doc(no_inline)] +pub use super::{masksizex1, masksizex2, masksizex4, masksizex8, masksizex16, masksizex32, masksizex64}; From 9e818d62b3b2c12bf516229555990ecb85d249f5 Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Tue, 2 May 2023 22:32:20 -0400 Subject: [PATCH 22/35] Ignore doctest --- crates/core_simd/src/simd/prelude.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/simd/prelude.rs b/crates/core_simd/src/simd/prelude.rs index 7d4f0798afa..bdaed3ba067 100644 --- a/crates/core_simd/src/simd/prelude.rs +++ b/crates/core_simd/src/simd/prelude.rs @@ -1,7 +1,7 @@ //! The portable SIMD prelude. //! //! Includes important traits and types to be imported with a glob: -//! ``` +//! ```ignore //! use std::simd::prelude::*; //! 
``` From c55e19cb00f65580a81a59a6f8d31ee29c59ea7e Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Tue, 2 May 2023 22:33:01 -0400 Subject: [PATCH 23/35] Add Simd, Mask, simd_swizzle to prelude Co-authored-by: Jacob Lifshay --- crates/core_simd/src/simd/prelude.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/core_simd/src/simd/prelude.rs b/crates/core_simd/src/simd/prelude.rs index bdaed3ba067..e8fdc932d49 100644 --- a/crates/core_simd/src/simd/prelude.rs +++ b/crates/core_simd/src/simd/prelude.rs @@ -7,7 +7,8 @@ #[doc(no_inline)] pub use super::{ - SimdConstPtr, SimdFloat, SimdInt, SimdMutPtr, SimdOrd, SimdPartialEq, SimdPartialOrd, SimdUint, + simd_swizzle, Mask, Simd, SimdConstPtr, SimdFloat, SimdInt, SimdMutPtr, SimdOrd, SimdPartialEq, + SimdPartialOrd, SimdUint, }; #[rustfmt::skip] From 048264e0a32ca492cbc2d742e170f924e6f5ce13 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Sat, 4 Feb 2023 19:12:01 -0800 Subject: [PATCH 24/35] Hide repr attribute from doc of types without guaranteed repr --- crates/core_simd/src/masks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index e58df80fca8..e0f3c7beef6 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -88,7 +88,7 @@ impl_element! { isize } /// The layout of this type is unspecified, and may change between platforms /// and/or Rust versions, and code should not assume that it is equivalent to /// `[T; LANES]`. -#[repr(transparent)] +#[cfg_attr(not(doc), repr(transparent))] // work around https://github.com/rust-lang/rust/issues/90435 pub struct Mask(mask_impl::Mask) where T: MaskElement, From 6626cd824936e10e492957537c26f28d0739b567 Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Sat, 20 May 2023 17:17:56 -0400 Subject: [PATCH 25/35] Remove cast_ptr in favor of cast which acts like pointer::cast. Move number casts to number traits. 
--- crates/core_simd/src/cast.rs | 74 ++++++++++----------- crates/core_simd/src/elements/const_ptr.rs | 30 +++++++-- crates/core_simd/src/elements/float.rs | 67 ++++++++++++++++++- crates/core_simd/src/elements/int.rs | 19 +++++- crates/core_simd/src/elements/mut_ptr.rs | 30 +++++++-- crates/core_simd/src/elements/uint.rs | 19 +++++- crates/core_simd/src/vector.rs | 75 +--------------------- crates/core_simd/tests/cast.rs | 3 +- crates/core_simd/tests/round.rs | 1 + 9 files changed, 194 insertions(+), 124 deletions(-) diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs index 65a3f845ffc..1c3592f8075 100644 --- a/crates/core_simd/src/cast.rs +++ b/crates/core_simd/src/cast.rs @@ -1,55 +1,51 @@ use crate::simd::SimdElement; +mod sealed { + /// Cast vector elements to other types. + /// + /// # Safety + /// Implementing this trait asserts that the type is a valid vector element for the `simd_cast` + /// or `simd_as` intrinsics. + pub unsafe trait Sealed {} +} +use sealed::Sealed; + /// Supporting trait for `Simd::cast`. Typically doesn't need to be used directly. -/// -/// # Safety -/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast` or -/// `simd_as` intrinsics. 
-pub unsafe trait SimdCast: SimdElement {} +pub trait SimdCast: Sealed + SimdElement {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for i8 {} +unsafe impl Sealed for i8 {} +impl SimdCast for i8 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for i16 {} +unsafe impl Sealed for i16 {} +impl SimdCast for i16 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for i32 {} +unsafe impl Sealed for i32 {} +impl SimdCast for i32 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for i64 {} +unsafe impl Sealed for i64 {} +impl SimdCast for i64 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for isize {} +unsafe impl Sealed for isize {} +impl SimdCast for isize {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for u8 {} +unsafe impl Sealed for u8 {} +impl SimdCast for u8 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for u16 {} +unsafe impl Sealed for u16 {} +impl SimdCast for u16 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for u32 {} +unsafe impl Sealed for u32 {} +impl SimdCast for u32 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for u64 {} +unsafe impl Sealed for u64 {} +impl SimdCast for u64 {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for usize {} +unsafe impl Sealed for usize {} +impl SimdCast for usize {} // Safety: primitive number types can be cast to other primitive number types -unsafe impl SimdCast for f32 {} +unsafe impl Sealed for f32 {} +impl SimdCast for f32 {} // Safety: primitive number types can be cast to other 
primitive number types -unsafe impl SimdCast for f64 {} - -/// Supporting trait for `Simd::cast_ptr`. Typically doesn't need to be used directly. -/// -/// # Safety -/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast_ptr` -/// intrinsic. -pub unsafe trait SimdCastPtr {} - -// Safety: pointers can be cast to other pointer types -unsafe impl SimdCastPtr for *const U -where - U: core::ptr::Pointee, - T: core::ptr::Pointee, -{ -} -// Safety: pointers can be cast to other pointer types -unsafe impl SimdCastPtr for *mut U -where - U: core::ptr::Pointee, - T: core::ptr::Pointee, -{ -} +unsafe impl Sealed for f64 {} +impl SimdCast for f64 {} diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs index 0ef9802b5e2..f215f9a61d0 100644 --- a/crates/core_simd/src/elements/const_ptr.rs +++ b/crates/core_simd/src/elements/const_ptr.rs @@ -1,5 +1,5 @@ use super::sealed::Sealed; -use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount}; +use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SimdUint, SupportedLaneCount}; /// Operations on SIMD vectors of constant pointers. pub trait SimdConstPtr: Copy + Sealed { @@ -9,6 +9,9 @@ pub trait SimdConstPtr: Copy + Sealed { /// Vector of `isize` with the same number of lanes. type Isize; + /// Vector of const pointers with the same number of lanes. + type CastPtr; + /// Vector of mutable pointers to the same type. type MutPtr; @@ -18,6 +21,11 @@ pub trait SimdConstPtr: Copy + Sealed { /// Returns `true` for each lane that is null. fn is_null(self) -> Self::Mask; + /// Casts to a pointer of another type. + /// + /// Equivalent to calling [`pointer::cast`] on each lane. + fn cast(self) -> Self::CastPtr; + /// Changes constness without changing the type. /// /// Equivalent to calling [`pointer::cast_mut`] on each lane. 
@@ -78,6 +86,7 @@ where { type Usize = Simd; type Isize = Simd; + type CastPtr = Simd<*const U, LANES>; type MutPtr = Simd<*mut T, LANES>; type Mask = Mask; @@ -86,9 +95,22 @@ where Simd::splat(core::ptr::null()).simd_eq(self) } + #[inline] + fn cast(self) -> Self::CastPtr { + // SimdElement currently requires zero-sized metadata, so this should never fail. + // If this ever changes, `simd_cast_ptr` should produce a post-mono error. + use core::{mem::size_of, ptr::Pointee}; + assert_eq!(size_of::<::Metadata>(), 0); + assert_eq!(size_of::<::Metadata>(), 0); + + // Safety: pointers can be cast + unsafe { intrinsics::simd_cast_ptr(self) } + } + #[inline] fn cast_mut(self) -> Self::MutPtr { - self.cast_ptr() + // Safety: pointers can be cast + unsafe { intrinsics::simd_cast_ptr(self) } } #[inline] @@ -106,9 +128,9 @@ where // In the mean-time, this operation is defined to be "as if" it was // a wrapping_offset, so we can emulate it as such. This should properly // restore pointer provenance even under today's compiler. - self.cast_ptr::<*const u8>() + self.cast::() .wrapping_offset(addr.cast::() - self.addr().cast::()) - .cast_ptr() + .cast() } #[inline] diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/elements/float.rs index d6022327055..f6f6f51de53 100644 --- a/crates/core_simd/src/elements/float.rs +++ b/crates/core_simd/src/elements/float.rs @@ -1,6 +1,6 @@ use super::sealed::Sealed; use crate::simd::{ - intrinsics, LaneCount, Mask, Simd, SimdElement, SimdPartialEq, SimdPartialOrd, + intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SimdPartialEq, SimdPartialOrd, SupportedLaneCount, }; @@ -15,6 +15,52 @@ pub trait SimdFloat: Copy + Sealed { /// Bit representation of this SIMD vector type. type Bits; + /// A SIMD vector with a different element type. + type Cast; + + /// Performs elementwise conversion of this vector's elements to another SIMD-valid type. 
+ /// + /// This follows the semantics of Rust's `as` conversion for floats (truncating or saturating + /// at the limits) for each element. + /// + /// # Example + /// ``` + /// # #![feature(portable_simd)] + /// # use core::simd::Simd; + /// let floats: Simd = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]); + /// let ints = floats.cast::(); + /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0])); + /// + /// // Formally equivalent, but `Simd::cast` can optimize better. + /// assert_eq!(ints, Simd::from_array(floats.to_array().map(|x| x as i32))); + /// + /// // The float conversion does not round-trip. + /// let floats_again = ints.cast(); + /// assert_ne!(floats, floats_again); + /// assert_eq!(floats_again, Simd::from_array([1.0, -4.0, 2147483647.0, 0.0])); + /// ``` + #[must_use] + fn cast(self) -> Self::Cast; + + /// Rounds toward zero and converts to the same-width integer type, assuming that + /// the value is finite and fits in that type. + /// + /// # Safety + /// The value must: + /// + /// * Not be NaN + /// * Not be infinite + /// * Be representable in the return type, after truncating off its fractional part + /// + /// If these requirements are infeasible or costly, consider using the safe function [cast], + /// which saturates on conversion. + /// + /// [cast]: Simd::cast + unsafe fn to_int_unchecked(self) -> Self::Cast + where + Self::Scalar: core::convert::FloatToInt + SimdCast, + I: SimdCast; + /// Raw transmutation to an unsigned integer vector type with the /// same size and number of lanes. #[must_use = "method returns a new vector and does not mutate the original value"] @@ -206,6 +252,25 @@ macro_rules! 
impl_trait { type Mask = Mask<<$mask_ty as SimdElement>::Mask, LANES>; type Scalar = $ty; type Bits = Simd<$bits_ty, LANES>; + type Cast = Simd; + + #[inline] + fn cast(self) -> Self::Cast + { + // Safety: supported types are guaranteed by SimdCast + unsafe { intrinsics::simd_as(self) } + } + + #[inline] + #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces + unsafe fn to_int_unchecked(self) -> Self::Cast + where + Self::Scalar: core::convert::FloatToInt + SimdCast, + I: SimdCast, + { + // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants + unsafe { intrinsics::simd_cast(self) } + } #[inline] fn to_bits(self) -> Simd<$bits_ty, LANES> { diff --git a/crates/core_simd/src/elements/int.rs b/crates/core_simd/src/elements/int.rs index 9b8c37ed466..6db89ff9a65 100644 --- a/crates/core_simd/src/elements/int.rs +++ b/crates/core_simd/src/elements/int.rs @@ -1,6 +1,6 @@ use super::sealed::Sealed; use crate::simd::{ - intrinsics, LaneCount, Mask, Simd, SimdElement, SimdPartialOrd, SupportedLaneCount, + intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SimdPartialOrd, SupportedLaneCount, }; /// Operations on SIMD vectors of signed integers. @@ -11,6 +11,16 @@ pub trait SimdInt: Copy + Sealed { /// Scalar type contained by this SIMD vector type. type Scalar; + /// A SIMD vector with a different element type. + type Cast; + + /// Performs elementwise conversion of this vector's elements to another SIMD-valid type. + /// + /// This follows the semantics of Rust's `as` conversion for casting integers (wrapping to + /// other integer types, and saturating to float types). + #[must_use] + fn cast(self) -> Self::Cast; + /// Lanewise saturating add. /// /// # Examples @@ -198,6 +208,13 @@ macro_rules! 
impl_trait { { type Mask = Mask<<$ty as SimdElement>::Mask, LANES>; type Scalar = $ty; + type Cast = Simd; + + #[inline] + fn cast(self) -> Self::Cast { + // Safety: supported types are guaranteed by SimdCast + unsafe { intrinsics::simd_as(self) } + } #[inline] fn saturating_add(self, second: Self) -> Self { diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs index d87986b4a09..4bdc6a14ce4 100644 --- a/crates/core_simd/src/elements/mut_ptr.rs +++ b/crates/core_simd/src/elements/mut_ptr.rs @@ -1,5 +1,5 @@ use super::sealed::Sealed; -use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount}; +use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SimdUint, SupportedLaneCount}; /// Operations on SIMD vectors of mutable pointers. pub trait SimdMutPtr: Copy + Sealed { @@ -9,6 +9,9 @@ pub trait SimdMutPtr: Copy + Sealed { /// Vector of `isize` with the same number of lanes. type Isize; + /// Vector of const pointers with the same number of lanes. + type CastPtr; + /// Vector of constant pointers to the same type. type ConstPtr; @@ -18,6 +21,11 @@ pub trait SimdMutPtr: Copy + Sealed { /// Returns `true` for each lane that is null. fn is_null(self) -> Self::Mask; + /// Casts to a pointer of another type. + /// + /// Equivalent to calling [`pointer::cast`] on each lane. + fn cast(self) -> Self::CastPtr; + /// Changes constness without changing the type. /// /// Equivalent to calling [`pointer::cast_const`] on each lane. @@ -73,6 +81,7 @@ where { type Usize = Simd; type Isize = Simd; + type CastPtr = Simd<*mut U, LANES>; type ConstPtr = Simd<*const T, LANES>; type Mask = Mask; @@ -81,9 +90,22 @@ where Simd::splat(core::ptr::null_mut()).simd_eq(self) } + #[inline] + fn cast(self) -> Self::CastPtr { + // SimdElement currently requires zero-sized metadata, so this should never fail. + // If this ever changes, `simd_cast_ptr` should produce a post-mono error. 
+ use core::{mem::size_of, ptr::Pointee}; + assert_eq!(size_of::<::Metadata>(), 0); + assert_eq!(size_of::<::Metadata>(), 0); + + // Safety: pointers can be cast + unsafe { intrinsics::simd_cast_ptr(self) } + } + #[inline] fn cast_const(self) -> Self::ConstPtr { - self.cast_ptr() + // Safety: pointers can be cast + unsafe { intrinsics::simd_cast_ptr(self) } } #[inline] @@ -101,9 +123,9 @@ where // In the mean-time, this operation is defined to be "as if" it was // a wrapping_offset, so we can emulate it as such. This should properly // restore pointer provenance even under today's compiler. - self.cast_ptr::<*mut u8>() + self.cast::() .wrapping_offset(addr.cast::() - self.addr().cast::()) - .cast_ptr() + .cast() } #[inline] diff --git a/crates/core_simd/src/elements/uint.rs b/crates/core_simd/src/elements/uint.rs index 21e7e76eb3d..3926c395ec9 100644 --- a/crates/core_simd/src/elements/uint.rs +++ b/crates/core_simd/src/elements/uint.rs @@ -1,11 +1,21 @@ use super::sealed::Sealed; -use crate::simd::{intrinsics, LaneCount, Simd, SupportedLaneCount}; +use crate::simd::{intrinsics, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount}; /// Operations on SIMD vectors of unsigned integers. pub trait SimdUint: Copy + Sealed { /// Scalar type contained by this SIMD vector type. type Scalar; + /// A SIMD vector with a different element type. + type Cast; + + /// Performs elementwise conversion of this vector's elements to another SIMD-valid type. + /// + /// This follows the semantics of Rust's `as` conversion for casting integers (wrapping to + /// other integer types, and saturating to float types). + #[must_use] + fn cast(self) -> Self::Cast; + /// Lanewise saturating add. /// /// # Examples @@ -77,6 +87,13 @@ macro_rules! 
impl_trait { LaneCount: SupportedLaneCount, { type Scalar = $ty; + type Cast = Simd; + + #[inline] + fn cast(self) -> Self::Cast { + // Safety: supported types are guaranteed by SimdCast + unsafe { intrinsics::simd_as(self) } + } #[inline] fn saturating_add(self, second: Self) -> Self { diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 3323b92e37b..10a4c8e86f0 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -1,6 +1,6 @@ use crate::simd::{ - intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdCastPtr, SimdConstPtr, SimdMutPtr, - SimdPartialOrd, SupportedLaneCount, Swizzle, + intrinsics, LaneCount, Mask, MaskElement, SimdConstPtr, SimdMutPtr, SimdPartialOrd, + SupportedLaneCount, Swizzle, }; use core::convert::{TryFrom, TryInto}; @@ -297,77 +297,6 @@ where unsafe { self.store(slice.as_mut_ptr().cast()) } } - /// Performs elementwise conversion of a SIMD vector's elements to another SIMD-valid type. - /// - /// This follows the semantics of Rust's `as` conversion for casting integers between - /// signed and unsigned (interpreting integers as 2s complement, so `-1` to `U::MAX` and - /// `1 << (U::BITS -1)` becoming `I::MIN` ), and from floats to integers (truncating, - /// or saturating at the limits) for each element. - /// - /// # Examples - /// ``` - /// # #![feature(portable_simd)] - /// # use core::simd::Simd; - /// let floats: Simd = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]); - /// let ints = floats.cast::(); - /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0])); - /// - /// // Formally equivalent, but `Simd::cast` can optimize better. - /// assert_eq!(ints, Simd::from_array(floats.to_array().map(|x| x as i32))); - /// - /// // The float conversion does not round-trip. 
- /// let floats_again = ints.cast(); - /// assert_ne!(floats, floats_again); - /// assert_eq!(floats_again, Simd::from_array([1.0, -4.0, 2147483647.0, 0.0])); - /// ``` - #[must_use] - #[inline] - #[cfg(not(bootstrap))] - pub fn cast(self) -> Simd - where - T: SimdCast, - { - // Safety: supported types are guaranteed by SimdCast - unsafe { intrinsics::simd_as(self) } - } - - /// Casts a vector of pointers to another pointer type. - #[must_use] - #[inline] - pub fn cast_ptr(self) -> Simd - where - T: SimdCastPtr, - U: SimdElement, - { - // Safety: supported types are guaranteed by SimdCastPtr - unsafe { intrinsics::simd_cast_ptr(self) } - } - - /// Rounds toward zero and converts to the same-width integer type, assuming that - /// the value is finite and fits in that type. - /// - /// # Safety - /// The value must: - /// - /// * Not be NaN - /// * Not be infinite - /// * Be representable in the return type, after truncating off its fractional part - /// - /// If these requirements are infeasible or costly, consider using the safe function [cast], - /// which saturates on conversion. - /// - /// [cast]: Simd::cast - #[inline] - #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces - pub unsafe fn to_int_unchecked(self) -> Simd - where - T: core::convert::FloatToInt + SimdCast, - I: SimdCast, - { - // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants - unsafe { intrinsics::simd_cast(self) } - } - /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. /// If an index is out-of-bounds, the element is instead selected from the `or` vector. /// diff --git a/crates/core_simd/tests/cast.rs b/crates/core_simd/tests/cast.rs index ab5650f0713..00545936ea2 100644 --- a/crates/core_simd/tests/cast.rs +++ b/crates/core_simd/tests/cast.rs @@ -2,7 +2,8 @@ macro_rules! 
cast_types { ($start:ident, $($target:ident),*) => { mod $start { - use core_simd::simd::Simd; + #[allow(unused)] + use core_simd::simd::{Simd, SimdInt, SimdUint, SimdFloat}; type Vector = Simd<$start, N>; $( mod $target { diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs index 8b9638ad466..aacf7bd3bcc 100644 --- a/crates/core_simd/tests/round.rs +++ b/crates/core_simd/tests/round.rs @@ -53,6 +53,7 @@ macro_rules! float_rounding_test { test_helpers::test_lanes! { fn to_int_unchecked() { + use core_simd::simd::SimdFloat; // The maximum integer that can be represented by the equivalently sized float has // all of the mantissa digits set to 1, pushed up to the MSB. const ALL_MANTISSA_BITS: IntScalar = ((1 << ::MANTISSA_DIGITS) - 1); From f4ee1ab71174329e04af88370948e08a11c668a5 Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Sat, 20 May 2023 17:28:05 -0400 Subject: [PATCH 26/35] Simplify to_int_unchecked --- crates/core_simd/src/elements/float.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/elements/float.rs index f6f6f51de53..70b18eee0c0 100644 --- a/crates/core_simd/src/elements/float.rs +++ b/crates/core_simd/src/elements/float.rs @@ -56,10 +56,9 @@ pub trait SimdFloat: Copy + Sealed { /// which saturates on conversion. /// /// [cast]: Simd::cast - unsafe fn to_int_unchecked(self) -> Self::Cast + unsafe fn to_int_unchecked(self) -> Self::Cast where - Self::Scalar: core::convert::FloatToInt + SimdCast, - I: SimdCast; + Self::Scalar: core::convert::FloatToInt; /// Raw transmutation to an unsigned integer vector type with the /// same size and number of lanes. @@ -263,10 +262,9 @@ macro_rules! 
impl_trait { #[inline] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces - unsafe fn to_int_unchecked(self) -> Self::Cast + unsafe fn to_int_unchecked(self) -> Self::Cast where - Self::Scalar: core::convert::FloatToInt + SimdCast, - I: SimdCast, + Self::Scalar: core::convert::FloatToInt, { // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants unsafe { intrinsics::simd_cast(self) } From c0b7df522ff64773b783e702719537a61f55288a Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Sun, 7 May 2023 07:30:44 -0700 Subject: [PATCH 27/35] Add `#[inline]` to functions which were missing it, and `#[track_caller]` to ones with runtime panics from user input --- crates/core_simd/src/iter.rs | 4 ++++ crates/core_simd/src/lib.rs | 2 +- crates/core_simd/src/masks.rs | 4 ++++ crates/core_simd/src/ops.rs | 8 +++++++- crates/core_simd/src/ord.rs | 4 ++++ crates/core_simd/src/vector.rs | 18 +++++++++++++++++- 6 files changed, 37 insertions(+), 3 deletions(-) diff --git a/crates/core_simd/src/iter.rs b/crates/core_simd/src/iter.rs index 3275b4db8e4..328c995b81d 100644 --- a/crates/core_simd/src/iter.rs +++ b/crates/core_simd/src/iter.rs @@ -10,6 +10,7 @@ macro_rules! impl_traits { where LaneCount: SupportedLaneCount, { + #[inline] fn sum>(iter: I) -> Self { iter.fold(Simd::splat(0 as $type), Add::add) } @@ -19,6 +20,7 @@ macro_rules! impl_traits { where LaneCount: SupportedLaneCount, { + #[inline] fn product>(iter: I) -> Self { iter.fold(Simd::splat(1 as $type), Mul::mul) } @@ -28,6 +30,7 @@ macro_rules! impl_traits { where LaneCount: SupportedLaneCount, { + #[inline] fn sum>(iter: I) -> Self { iter.fold(Simd::splat(0 as $type), Add::add) } @@ -37,6 +40,7 @@ macro_rules! 
impl_traits { where LaneCount: SupportedLaneCount, { + #[inline] fn product>(iter: I) -> Self { iter.fold(Simd::splat(1 as $type), Mul::mul) } diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs index e5307de2155..fde406bda70 100644 --- a/crates/core_simd/src/lib.rs +++ b/crates/core_simd/src/lib.rs @@ -16,7 +16,7 @@ )] #![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))] #![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))] -#![warn(missing_docs)] +#![warn(missing_docs, clippy::missing_inline_in_public_items)] // basically all items, really #![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)] #![unstable(feature = "portable_simd", issue = "86656")] //! Portable SIMD module. diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs index e58df80fca8..e04448a50be 100644 --- a/crates/core_simd/src/masks.rs +++ b/crates/core_simd/src/masks.rs @@ -179,6 +179,7 @@ where /// Panics if any lane is not 0 or -1. #[inline] #[must_use = "method returns a new mask and does not mutate the original value"] + #[track_caller] pub fn from_int(value: Simd) -> Self { assert!(T::valid(value), "all values must be either 0 or -1",); // Safety: the validity has been checked @@ -217,6 +218,7 @@ where /// Panics if `lane` is greater than or equal to the number of lanes in the vector. #[inline] #[must_use = "method returns a new bool and does not mutate the original value"] + #[track_caller] pub fn test(&self, lane: usize) -> bool { assert!(lane < LANES, "lane index out of range"); // Safety: the lane index has been checked @@ -240,6 +242,7 @@ where /// # Panics /// Panics if `lane` is greater than or equal to the number of lanes in the vector. 
#[inline] + #[track_caller] pub fn set(&mut self, lane: usize, value: bool) { assert!(lane < LANES, "lane index out of range"); // Safety: the lane index has been checked @@ -327,6 +330,7 @@ where T: MaskElement + fmt::Debug, LaneCount: SupportedLaneCount, { + #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_list() .entries((0..LANES).map(|lane| self.test(lane))) diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs index 5a077a469d8..6e32eb11daf 100644 --- a/crates/core_simd/src/ops.rs +++ b/crates/core_simd/src/ops.rs @@ -15,6 +15,7 @@ where I: core::slice::SliceIndex<[T]>, { type Output = I::Output; + #[inline] fn index(&self, index: I) -> &Self::Output { &self.as_array()[index] } @@ -26,6 +27,7 @@ where LaneCount: SupportedLaneCount, I: core::slice::SliceIndex<[T]>, { + #[inline] fn index_mut(&mut self, index: I) -> &mut Self::Output { &mut self.as_mut_array()[index] } @@ -118,10 +120,14 @@ macro_rules! for_base_types { #[inline] #[must_use = "operator returns a new vector without mutating the inputs"] + // TODO: only useful for int Div::div, but we hope that this + // will essentially always always get inlined anyway. + #[track_caller] fn $call(self, rhs: Self) -> Self::Output { $macro_impl!(self, rhs, $inner, $scalar) } - })* + } + )* } } diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/ord.rs index 1ae9cd061fb..b2455190e82 100644 --- a/crates/core_simd/src/ord.rs +++ b/crates/core_simd/src/ord.rs @@ -94,6 +94,7 @@ macro_rules! impl_integer { } #[inline] + #[track_caller] fn simd_clamp(self, min: Self, max: Self) -> Self { assert!( min.simd_le(max).all(), @@ -200,6 +201,7 @@ macro_rules! 
impl_mask { } #[inline] + #[track_caller] fn simd_clamp(self, min: Self, max: Self) -> Self { assert!( min.simd_le(max).all(), @@ -254,6 +256,7 @@ where } #[inline] + #[track_caller] fn simd_clamp(self, min: Self, max: Self) -> Self { assert!( min.simd_le(max).all(), @@ -303,6 +306,7 @@ where } #[inline] + #[track_caller] fn simd_clamp(self, min: Self, max: Self) -> Self { assert!( min.simd_le(max).all(), diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 3323b92e37b..475b7533a8b 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -122,6 +122,7 @@ where /// let v = u32x4::splat(0); /// assert_eq!(v.lanes(), 4); /// ``` + #[inline] pub const fn lanes(&self) -> usize { Self::LANES } @@ -136,6 +137,7 @@ where /// let v = u32x4::splat(8); /// assert_eq!(v.as_array(), &[8, 8, 8, 8]); /// ``` + #[inline] pub fn splat(value: T) -> Self { // This is preferred over `[value; N]`, since it's explicitly a splat: // https://github.com/rust-lang/rust/issues/97804 @@ -156,6 +158,7 @@ where /// let v: u64x4 = Simd::from_array([0, 1, 2, 3]); /// assert_eq!(v.as_array(), &[0, 1, 2, 3]); /// ``` + #[inline] pub const fn as_array(&self) -> &[T; N] { // SAFETY: `Simd` is just an overaligned `[T; N]` with // potential padding at the end, so pointer casting to a @@ -167,6 +170,7 @@ where } /// Returns a mutable array reference containing the entire SIMD vector. + #[inline] pub fn as_mut_array(&mut self) -> &mut [T; N] { // SAFETY: `Simd` is just an overaligned `[T; N]` with // potential padding at the end, so pointer casting to a @@ -184,6 +188,7 @@ where /// /// # Safety /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read_unaligned`. 
+ #[inline] const unsafe fn load(ptr: *const [T; N]) -> Self { // There are potentially simpler ways to write this function, but this should result in // LLVM `load ` @@ -204,6 +209,7 @@ where /// /// # Safety /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write_unaligned`. + #[inline] const unsafe fn store(self, ptr: *mut [T; N]) { // There are potentially simpler ways to write this function, but this should result in // LLVM `store ` @@ -216,6 +222,7 @@ where } /// Converts an array to a SIMD vector. + #[inline] pub const fn from_array(array: [T; N]) -> Self { // SAFETY: `&array` is safe to read. // @@ -228,6 +235,7 @@ where } /// Converts a SIMD vector to an array. + #[inline] pub const fn to_array(self) -> [T; N] { let mut tmp = core::mem::MaybeUninit::uninit(); // SAFETY: writing to `tmp` is safe and initializes it. @@ -258,7 +266,8 @@ where /// let v = u32x4::from_slice(&source); /// assert_eq!(v.as_array(), &[1, 2, 3, 4]); /// ``` - #[must_use] + #[inline] + #[track_caller] pub const fn from_slice(slice: &[T]) -> Self { assert!( slice.len() >= Self::LANES, @@ -287,6 +296,8 @@ where /// v.copy_to_slice(&mut dest); /// assert_eq!(&dest, &[1, 2, 3, 4, 0, 0]); /// ``` + #[inline] + #[track_caller] pub fn copy_to_slice(self, slice: &mut [T]) { assert!( slice.len() >= Self::LANES, @@ -718,6 +729,7 @@ where LaneCount: SupportedLaneCount, T: SimdElement, { + #[inline] fn clone(&self) -> Self { *self } @@ -862,6 +874,7 @@ where LaneCount: SupportedLaneCount, T: SimdElement, { + #[inline] fn from(array: [T; N]) -> Self { Self::from_array(array) } @@ -872,6 +885,7 @@ where LaneCount: SupportedLaneCount, T: SimdElement, { + #[inline] fn from(vector: Simd) -> Self { vector.to_array() } @@ -884,6 +898,7 @@ where { type Error = core::array::TryFromSliceError; + #[inline] fn try_from(slice: &[T]) -> Result { Ok(Self::from_array(slice.try_into()?)) } @@ -896,6 +911,7 @@ where { type Error = core::array::TryFromSliceError; + #[inline] fn try_from(slice: &mut [T]) 
-> Result { Ok(Self::from_array(slice.try_into()?)) } From 0315db3cc183994545a29faeff8dbe944257fc91 Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Sun, 21 May 2023 14:00:48 -0400 Subject: [PATCH 28/35] Re-add missing #[must_use] --- crates/core_simd/src/vector.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 475b7533a8b..b1be2a2a114 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -266,6 +266,7 @@ where /// let v = u32x4::from_slice(&source); /// assert_eq!(v.as_array(), &[1, 2, 3, 4]); /// ``` + #[must_use] #[inline] #[track_caller] pub const fn from_slice(slice: &[T]) -> Self { From 1af32f0a3a2c34ab9822f96d199d0d9bad7c5b66 Mon Sep 17 00:00:00 2001 From: Taiki Endo Date: Tue, 30 May 2023 23:10:05 +0900 Subject: [PATCH 29/35] Fix build error on big endian arm/aarch64 --- crates/core_simd/src/swizzle_dyn.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs index 3eb80d5dca1..a4da461d546 100644 --- a/crates/core_simd/src/swizzle_dyn.rs +++ b/crates/core_simd/src/swizzle_dyn.rs @@ -16,9 +16,9 @@ where #[inline] pub fn swizzle_dyn(self, idxs: Simd) -> Self { #![allow(unused_imports, unused_unsafe)] - #[cfg(target_arch = "aarch64")] + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8}; - #[cfg(all(target_arch = "arm", target_feature = "v7"))] + #[cfg(all(target_arch = "arm", target_feature = "v7", target_endian = "little"))] use core::arch::arm::{uint8x8_t, vtbl1_u8}; #[cfg(target_arch = "wasm32")] use core::arch::wasm32 as wasm; @@ -29,13 +29,24 @@ where // SAFETY: Intrinsics covered by cfg unsafe { match N { - #[cfg(target_feature = "neon")] + #[cfg(all( + any( + target_arch = "aarch64", + all(target_arch = "arm", target_feature = "v7") + ), + target_feature = "neon", + 
target_endian = "little" + ))] 8 => transize(vtbl1_u8, self, idxs), #[cfg(target_feature = "ssse3")] 16 => transize(x86::_mm_shuffle_epi8, self, idxs), #[cfg(target_feature = "simd128")] 16 => transize(wasm::i8x16_swizzle, self, idxs), - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(all( + target_arch = "aarch64", + target_feature = "neon", + target_endian = "little" + ))] 16 => transize(vqtbl1q_u8, self, idxs), #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))] 32 => transize_raw(avx2_pshufb, self, idxs), From ed2ee7ace1534b05f53eb5cc24784dbbb4912214 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 30 May 2023 23:58:20 +0100 Subject: [PATCH 30/35] Explicitly set `workspace.resolver = "1"` rust-lang/cargo#10910 starts emitting warning if resolver is not set for 2021 edition package. We want to suppress the warning for now. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9802386e456..d1732aaec2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] - +resolver = "1" members = [ "crates/core_simd", "crates/std_float", From eb0041d15457a6b598196444ef9a852c7de42356 Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Wed, 7 Jun 2023 23:46:06 -0400 Subject: [PATCH 31/35] Format --- crates/core_simd/src/swizzle_dyn.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs index d97a4c90b9c..ce621792534 100644 --- a/crates/core_simd/src/swizzle_dyn.rs +++ b/crates/core_simd/src/swizzle_dyn.rs @@ -18,7 +18,12 @@ where #![allow(unused_imports, unused_unsafe)] #[cfg(all(target_arch = "aarch64", target_endian = "little"))] use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8}; - #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon", target_endian = "little"))] + #[cfg(all( + target_arch = "arm", + target_feature = "v7",
target_feature = "neon", + target_endian = "little" + ))] use core::arch::arm::{uint8x8_t, vtbl1_u8}; #[cfg(target_arch = "wasm32")] use core::arch::wasm32 as wasm; From f2f9bd7eb178bb19ba2f935903cf4de95b3952f5 Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Fri, 7 Jul 2023 03:32:29 -0700 Subject: [PATCH 32/35] Disable MIPS jobs in CI --- .github/workflows/ci.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index acd47a3da72..1ff377fce34 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,8 +38,9 @@ jobs: - i586-unknown-linux-gnu - aarch64-unknown-linux-gnu - armv7-unknown-linux-gnueabihf - - mips-unknown-linux-gnu - - mips64-unknown-linux-gnuabi64 + # non-nightly since https://github.com/rust-lang/rust/pull/113274 + # - mips-unknown-linux-gnu + # - mips64-unknown-linux-gnuabi64 - powerpc-unknown-linux-gnu - powerpc64-unknown-linux-gnu - riscv64gc-unknown-linux-gnu @@ -191,8 +192,8 @@ jobs: # Note: The issue above means neither of these mips targets will use # MSA (mips simd) but MIPS uses a nonstandard binary representation # for NaNs which makes it worth testing on despite that. 
- - mips-unknown-linux-gnu - - mips64-unknown-linux-gnuabi64 + # - mips-unknown-linux-gnu + # - mips64-unknown-linux-gnuabi64 - riscv64gc-unknown-linux-gnu # TODO this test works, but it appears to time out # - powerpc-unknown-linux-gnu From 789c38fae2f81985dac3c5181bc888a363ac781e Mon Sep 17 00:00:00 2001 From: Jubilee <46493976+workingjubilee@users.noreply.github.com> Date: Fri, 7 Jul 2023 03:49:42 -0700 Subject: [PATCH 33/35] Fixed cast imports in doctest (rust-lang/portable-simd#355) --- crates/core_simd/src/elements/float.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/elements/float.rs index 70b18eee0c0..22c5fdcd118 100644 --- a/crates/core_simd/src/elements/float.rs +++ b/crates/core_simd/src/elements/float.rs @@ -26,7 +26,9 @@ pub trait SimdFloat: Copy + Sealed { /// # Example /// ``` /// # #![feature(portable_simd)] - /// # use core::simd::Simd; + /// # #[cfg(feature = "as_crate")] use core_simd::simd; + /// # #[cfg(not(feature = "as_crate"))] use core::simd; + /// # use simd::{f32x2, SimdFloat, SimdInt, Simd}; /// let floats: Simd = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]); /// let ints = floats.cast::(); /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0])); From 7c7dbe0c505ccbc02ff30c1e37381ab1d47bf46f Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Fri, 7 Jul 2023 04:03:54 -0700 Subject: [PATCH 34/35] Remove unused import --- crates/core_simd/src/elements/float.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/elements/float.rs index 22c5fdcd118..501c1c5ddd3 100644 --- a/crates/core_simd/src/elements/float.rs +++ b/crates/core_simd/src/elements/float.rs @@ -28,7 +28,7 @@ pub trait SimdFloat: Copy + Sealed { /// # #![feature(portable_simd)] /// # #[cfg(feature = "as_crate")] use core_simd::simd; /// # #[cfg(not(feature = "as_crate"))] use core::simd; - /// # use 
simd::{f32x2, SimdFloat, SimdInt, Simd}; + /// # use simd::{SimdFloat, SimdInt, Simd}; /// let floats: Simd = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]); /// let ints = floats.cast::(); /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0])); From 37fea342ea26bb7042a6acdf683c8210eb2172ff Mon Sep 17 00:00:00 2001 From: Jubilee Young Date: Fri, 7 Jul 2023 04:33:08 -0700 Subject: [PATCH 35/35] Use new std::simd fn in miri tests Old fn were slightly divergent. --- src/tools/miri/tests/pass/portable-simd-ptrs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/miri/tests/pass/portable-simd-ptrs.rs b/src/tools/miri/tests/pass/portable-simd-ptrs.rs index 303c99834f5..843bb0284cf 100644 --- a/src/tools/miri/tests/pass/portable-simd-ptrs.rs +++ b/src/tools/miri/tests/pass/portable-simd-ptrs.rs @@ -6,7 +6,7 @@ use std::simd::*; fn main() { // Pointer casts - let _val: Simd<*const u8, 4> = Simd::<*const i32, 4>::splat(ptr::null()).cast_ptr(); + let _val: Simd<*const u8, 4> = Simd::<*const i32, 4>::splat(ptr::null()).cast(); let addrs = Simd::<*const i32, 4>::splat(ptr::null()).expose_addr(); let _ptrs = Simd::<*const i32, 4>::from_exposed_addr(addrs); }