For MIRI, cfg out the swap logic from 94212
commit b582bd388f (parent 10cc7a6d03)
3 changed files with 64 additions and 13 deletions
library/core/src/mem/mod.rs

@@ -708,7 +708,10 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
     // understanding `mem::replace`, `Option::take`, etc. - a better overall
     // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
     // a backend can choose to implement using the block optimization, or not.
-    #[cfg(not(target_arch = "spirv"))]
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(any(target_arch = "spirv", miri)))]
     {
         // For types that are larger multiples of their alignment, the simple way
         // tends to copy the whole thing to stack rather than doing it one part
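To make the gating concrete, here is a minimal sketch (not part of the commit; the exact threshold and the fallback call are simplifications) of the shape `mem::swap` takes after this change. Under Miri the attribute compiles the optimized block out entirely, so only the simple typed swap below it remains:

    use core::{mem, ptr};

    pub fn swap_sketch<T>(x: &mut T, y: &mut T) {
        #[cfg(not(any(target_arch = "spirv", miri)))]
        {
            // Types that are large multiples of their alignment piggy-back on
            // the chunked slice swap; under Miri this block does not exist.
            if mem::size_of::<T>() / mem::align_of::<T>() > 4 {
                // SAFETY: exclusive references are non-null, properly aligned,
                // and never overlap.
                return unsafe { ptr::swap_nonoverlapping(x, y, 1) };
            }
        }
        // Simple typed swap: the only path Miri and CTFE ever see.
        // (mem::swap stands in for the crate-private swap_simple shown next.)
        mem::swap(x, y);
    }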
@@ -737,12 +740,26 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 #[inline]
 pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
+    // We arrange for this to typically be called with small types,
+    // so this reads-and-writes approach is actually better than using
+    // copy_nonoverlapping as it easily puts things in LLVM registers
+    // directly and doesn't end up inlining allocas.
+    // And LLVM actually optimizes it to 3×memcpy if called with
+    // a type larger than it's willing to keep in a register.
+    // Having typed reads and writes in MIR here is also good as
+    // it lets MIRI and CTFE understand them better, including things
+    // like enforcing type validity for them.
+    // Importantly, read+copy_nonoverlapping+write introduces confusing
+    // asymmetry to the behaviour where one value went through read+write
+    // whereas the other was copied over by the intrinsic (see #94371).
+
     // SAFETY: exclusive references are always valid to read/write,
-    // are non-overlapping, and nothing here panics so it's drop-safe.
+    // including being aligned, and nothing here panics so it's drop-safe.
     unsafe {
-        let z = ptr::read(x);
-        ptr::copy_nonoverlapping(y, x, 1);
-        ptr::write(y, z);
+        let a = ptr::read(x);
+        let b = ptr::read(y);
+        ptr::write(x, b);
+        ptr::write(y, a);
     }
 }
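The new body is fully symmetric: both values go through a typed read and a typed write, which is what lets Miri and CTFE reason about them uniformly. A standalone copy for experimentation (the function name here is ours; the real one is crate-private in core::mem):

    use core::ptr;

    pub fn swap_simple_demo<T>(x: &mut T, y: &mut T) {
        // SAFETY: exclusive references are always valid to read/write,
        // including being aligned, and nothing here panics so it's drop-safe.
        unsafe {
            let a = ptr::read(x);
            let b = ptr::read(y);
            ptr::write(x, b);
            ptr::write(y, a);
        }
    }

    fn main() {
        let (mut a, mut b) = (String::from("x"), String::from("y"));
        swap_simple_demo(&mut a, &mut b);
        assert_eq!((a.as_str(), b.as_str()), ("y", "x"));
    }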
library/core/src/ptr/mod.rs

@@ -419,6 +419,7 @@ pub const unsafe fn swap<T>(x: *mut T, y: *mut T) {
 #[stable(feature = "swap_nonoverlapping", since = "1.27.0")]
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    #[allow(unused)]
     macro_rules! attempt_swap_as_chunks {
         ($ChunkTy:ty) => {
             if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
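The new `#[allow(unused)]` is needed because the macro's only call sites end up inside the `#[cfg(not(miri))]` block added in the next hunk, so under Miri the definition would otherwise trip the `unused_macros` lint. A tiny repro of that pattern (names are hypothetical):

    #[allow(unused)] // without this, building with --cfg miri warns: unused_macros
    macro_rules! only_used_without_miri {
        () => {};
    }

    #[cfg(not(miri))]
    only_used_without_miri!();

    fn main() {}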
@@ -437,15 +438,21 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
         };
     }

-    // Split up the slice into small power-of-two-sized chunks that LLVM is able
-    // to vectorize (unless it's a special type with more-than-pointer alignment,
-    // because we don't want to pessimize things like slices of SIMD vectors.)
-    if mem::align_of::<T>() <= mem::size_of::<usize>()
-        && (!mem::size_of::<T>().is_power_of_two()
-            || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(miri))]
     {
-        attempt_swap_as_chunks!(usize);
-        attempt_swap_as_chunks!(u8);
+        // Split up the slice into small power-of-two-sized chunks that LLVM is able
+        // to vectorize (unless it's a special type with more-than-pointer alignment,
+        // because we don't want to pessimize things like slices of SIMD vectors.)
+        if mem::align_of::<T>() <= mem::size_of::<usize>()
+            && (!mem::size_of::<T>().is_power_of_two()
+                || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+        {
+            attempt_swap_as_chunks!(usize);
+            attempt_swap_as_chunks!(u8);
+        }
     }

     // SAFETY: Same preconditions as this function
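For reference, what an expansion of `attempt_swap_as_chunks!` boils down to, written as a free-standing sketch (the helper name and loop are illustrative; the real macro early-returns out of `swap_nonoverlapping` into a shared helper): once alignment and size permit, reinterpret both regions as `MaybeUninit<Chunk>` so the backend sees loads and stores it can vectorize.

    use core::mem::{self, MaybeUninit};

    /// Illustrative only: swap `count` values of `T` in `Chunk`-sized pieces.
    /// Caller must uphold the same preconditions as `ptr::swap_nonoverlapping`.
    unsafe fn swap_as_chunks_sketch<T, Chunk>(x: *mut T, y: *mut T, count: usize) {
        debug_assert!(mem::align_of::<T>() >= mem::align_of::<Chunk>());
        debug_assert!(mem::size_of::<T>() % mem::size_of::<Chunk>() == 0);
        let chunks = count * (mem::size_of::<T>() / mem::size_of::<Chunk>());
        // MaybeUninit keeps copying any padding bytes sound.
        let x = x.cast::<MaybeUninit<Chunk>>();
        let y = y.cast::<MaybeUninit<Chunk>>();
        for i in 0..chunks {
            let a = x.add(i).read();
            let b = y.add(i).read();
            x.add(i).write(b);
            y.add(i).write(a);
        }
    }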
src/test/codegen/swap-large-types.rs

@@ -39,6 +39,9 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     swap(x, y)
 }

+// Verify that types with usize alignment are swapped via vectored usizes,
+// not falling back to byte-level code.
+
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
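Why KeccakBuffer qualifies for the usize path, assuming the `type KeccakBuffer = [[u64; 5]; 5];` definition from earlier in this test file and a typical 64-bit target:

    use core::mem::{align_of, size_of};

    type KeccakBuffer = [[u64; 5]; 5]; // assumed from earlier in the test file

    fn main() {
        // 200 bytes with alignment 8: not more-than-pointer-aligned, and not
        // a power of two, so the chunked path is attempted...
        assert!(align_of::<KeccakBuffer>() <= size_of::<usize>());
        assert!(!size_of::<KeccakBuffer>().is_power_of_two());
        // ...and the usize attempt succeeds on its alignment precondition.
        assert!(align_of::<KeccakBuffer>() >= align_of::<usize>());
    }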
@@ -50,6 +53,8 @@ pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     }
 }

+// But for a large align-1 type, vectorized byte copying is what we want.
+
 type OneKilobyteBuffer = [u8; 1024];

 // CHECK-LABEL: @swap_1kb_slices
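The align-1 case exercises the u8 fallback instead: the outer predicate still passes (1 KiB is well past two usizes), but the usize attempt is rejected on alignment, leaving `attempt_swap_as_chunks!(u8)` to do the work. A quick check of those conditions (64-bit target assumed):

    use core::mem::{align_of, size_of};

    type OneKilobyteBuffer = [u8; 1024];

    fn main() {
        // Outer predicate: 1024 is a power of two but exceeds 2 * usize bytes,
        // so chunked swapping is attempted at all.
        assert!(size_of::<OneKilobyteBuffer>() > size_of::<usize>() * 2);
        // The usize attempt fails its alignment precondition...
        assert!(align_of::<OneKilobyteBuffer>() < align_of::<usize>());
        // ...so the u8 attempt (alignment 1) is the one that runs.
        assert_eq!(align_of::<OneKilobyteBuffer>(), 1);
    }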
@@ -62,3 +67,25 @@ pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer])
         x.swap_with_slice(y);
     }
 }
+
+// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
+// for an unusual type like this. It's not clear whether we should do anything
+// smarter in Rust for these, so for now it's fine to leave these up to the backend.
+// That's not as bad as it might seem, as for example, LLVM will lower the
+// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
+// Eventually we'll be able to pass `align_of::<T>` to a const generic and
+// thus pick a smarter chunk size ourselves without huge code duplication.
+#[repr(align(64))]
+pub struct BigButHighlyAligned([u8; 64 * 3]);
+
+// CHECK-LABEL: @swap_big_aligned
+#[no_mangle]
+pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
+    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: call void @llvm.memcpy
+    swap(x, y)
+}
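Why exactly three memcpys: 192 bytes is only 3x the 64-byte alignment, so (under the heuristics above) the value never takes the chunked path and goes through swap_simple; LLVM then merges its two reads and two writes into three stack-buffered copies (x -> temp, y -> x, temp -> y). A sanity check of the numbers behind that claim:

    use core::mem::{align_of, size_of};

    #[repr(align(64))]
    pub struct BigButHighlyAligned([u8; 64 * 3]);

    fn main() {
        // More-than-pointer alignment fails the chunked-swap predicate...
        assert!(align_of::<BigButHighlyAligned>() > size_of::<usize>());
        // ...and the size is a small multiple of the alignment, so the
        // simple read/read/write/write swap is what the backend optimizes.
        assert_eq!(
            size_of::<BigButHighlyAligned>() / align_of::<BigButHighlyAligned>(),
            3,
        );
    }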