1
Fork 0

Stop manually SIMDing in swap_nonoverlapping

As I previously did for `reverse`, this leaves it up to LLVM to decide how to vectorize, since it can pick a better chunk size than the "32 bytes always" approach we currently have.

It does still need logic to type-erase where appropriate, though, as while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`.

As a bonus, this also means one no longer gets the spurious `memcpy`(s?) at the end when swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y>
This commit is contained in:
Scott McMurray 2022-02-20 23:25:18 -08:00
parent 73a7423e77
commit 8ca47d7ae4
6 changed files with 263 additions and 97 deletions

View file

@ -89,6 +89,15 @@ fn binary_search_l3_worst_case(b: &mut Bencher) {
binary_search_worst_case(b, Cache::L3);
}
/// Three-byte tuple struct; its size (3 bytes) is not a power of two.
#[derive(Clone)]
struct Rgb(u8, u8, u8);

impl Rgb {
    /// Builds an `Rgb` from an index.
    ///
    /// The first byte is `i` truncated to `u8`; the second and third are that
    /// byte plus 7 and plus 42 respectively, both with wrapping arithmetic.
    fn gen(i: usize) -> Self {
        // Truncation is intentional: only the low 8 bits of the index are used.
        let base = i as u8;
        let second = base.wrapping_add(7);
        let third = base.wrapping_add(42);
        Rgb(base, second, third)
    }
}
macro_rules! rotate {
($fn:ident, $n:expr, $mapper:expr) => {
#[bench]
@ -104,17 +113,43 @@ macro_rules! rotate {
};
}
// NOTE(review): this repeats the `Rgb` definition that already appears earlier
// in this view; presumably it is the pre-change location shown by the diff and
// only one of the two should remain — confirm against the real file.
#[derive(Clone)]
struct Rgb(u8, u8, u8);
// Instantiations of the `rotate!` benchmark macro. Arguments are the bench
// function name, the `$n` size expression, and the closure/function that maps
// an index to an element.
// NOTE(review): `rotate_rgb` is instantiated twice below (once with an inline
// closure, once with `Rgb::gen`); presumably these are the old and new sides
// of the diff and only one should remain — confirm.
rotate!(rotate_u8, 32, |i| i as u8);
rotate!(rotate_rgb, 32, |i| Rgb(i as u8, (i as u8).wrapping_add(7), (i as u8).wrapping_add(42)));
rotate!(rotate_rgb, 32, Rgb::gen);
rotate!(rotate_usize, 32, |i| i);
rotate!(rotate_16_usize_4, 16, |i| [i; 4]);
rotate!(rotate_16_usize_5, 16, |i| [i; 5]);
rotate!(rotate_64_usize_4, 64, |i| [i; 4]);
rotate!(rotate_64_usize_5, 64, |i| [i; 5]);
/// Generates a `#[bench]` function named `$fn` that measures
/// `<[T]>::swap_with_slice` on two `$n`-element vectors.
///
/// * `$fn` - name of the generated benchmark function.
/// * `$n` - number of elements in each vector.
/// * `$mapper` - closure or function turning an index into an element.
macro_rules! swap_with_slice {
    ($fn:ident, $n:expr, $mapper:expr) => {
        #[bench]
        fn $fn(b: &mut Bencher) {
            // The first buffer is built from indices `0..n`, the second from `n..2n`.
            let mut left: Vec<_> = (0usize..$n).map(&$mapper).collect();
            let mut right: Vec<_> = ($n..($n * 2)).map(&$mapper).collect();
            let mut offset = 0;
            b.iter(|| {
                for _ in 0..32 {
                    // Swap the tail of `left` with an equally long prefix of `right`.
                    let tail = &mut left[offset..];
                    let head = &mut right[..($n - offset)];
                    tail.swap_with_slice(head);
                    // Cycle the starting offset through 0..8; the incremented
                    // value is routed through `black_box` before the modulo.
                    offset = black_box(offset + 1) % 8;
                }
                // Hand one element of each buffer to `black_box` as the iteration result.
                black_box((left[$n / 3].clone(), right[$n * 2 / 3].clone()))
            })
        }
    };
}
// Instantiations of the `swap_with_slice!` benchmark macro: each element kind
// (`u8`, `Rgb`, `usize`, `[usize; 4]`, `[usize; 5]`) is benchmarked at lengths
// 30 and 3000.
swap_with_slice!(swap_with_slice_u8_30, 30, |i| i as u8);
swap_with_slice!(swap_with_slice_u8_3000, 3000, |i| i as u8);
swap_with_slice!(swap_with_slice_rgb_30, 30, Rgb::gen);
swap_with_slice!(swap_with_slice_rgb_3000, 3000, Rgb::gen);
swap_with_slice!(swap_with_slice_usize_30, 30, |i| i);
swap_with_slice!(swap_with_slice_usize_3000, 3000, |i| i);
swap_with_slice!(swap_with_slice_4x_usize_30, 30, |i| [i; 4]);
swap_with_slice!(swap_with_slice_4x_usize_3000, 3000, |i| [i; 4]);
swap_with_slice!(swap_with_slice_5x_usize_30, 30, |i| [i; 5]);
swap_with_slice!(swap_with_slice_5x_usize_3000, 3000, |i| [i; 5]);
#[bench]
fn fill_byte_sized(b: &mut Bencher) {
#[derive(Copy, Clone)]