Auto merge of #107634 - scottmcm:array-drain, r=thomcc
Improve the `array::map` codegen
The `map` method on arrays [is documented as sometimes performing poorly](https://doc.rust-lang.org/std/primitive.array.html#note-on-performance-and-stack-usage), and after [a question on URLO](https://users.rust-lang.org/t/try-trait-residual-o-trait-and-try-collect-into-array/88510?u=scottmcm) prompted me to take another look at the core [`try_collect_into_array`](7c46fb2111/library/core/src/array/mod.rs (L865-L912)
) function, I had some ideas that ended up working better than I'd expected.
There's three main ideas in here, split over three commits:
1. Don't use `array::IntoIter` when we can avoid it, since that seems to not get SRoA'd, meaning that every step writes things like loop counters into the stack unnecessarily
2. Don't return arrays in `Result`s unnecessarily, as that doesn't seem to optimize away even with `unwrap_unchecked` (perhaps because it needs to get moved into a new LLVM type to account for the discriminant)
3. Don't distract LLVM with all the `Option` dances when we know for sure we have enough items (like in `map` and `zip`). This one's a larger commit as to do it I ended up adding a new `pub(crate)` trait, but hopefully those changes are still straight-forward.
(No libs-api changes; everything should be completely implementation-detail-internal.)
It's still not completely fixed -- I think it needs pcwalton's `memcpy` optimizations still (#103830) to get further -- but this seems to go much better than before. And the remaining `memcpy`s are just `transmute`-equivalent (`[T; N] -> ManuallyDrop<[T; N]>` and `[MaybeUninit<T>; N] -> [T; N]`), so hopefully those will be easier to remove with LLVM16 than the previous subobject copies 🤞
r? `@thomcc`
As a simple example, this test
```rust
pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] {
x.map(|x| 13 * x + 7)
}
```
On nightly <https://rust.godbolt.org/z/xK7548TGj> takes `sub rsp, 808`
```llvm
start:
%array.i.i.i.i = alloca [64 x i32], align 4
%_3.sroa.5.i.i.i = alloca [65 x i32], align 4
%_5.i = alloca %"core::iter::adapters::map::Map<core::array::iter::IntoIter<u32, 64>, [closure@/app/example.rs:2:11: 2:14]>", align 8
```
(and yes, that's a 6**5**-element array `alloca` despite 6**4**-element input and output)
But with this PR it's only `sub rsp, 520`
```llvm
start:
%array.i.i.i.i.i.i = alloca [64 x i32], align 4
%array1.i.i.i = alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>", align 4
```
Similarly, the loop it emits on nightly is scalar-only and horrifying
```nasm
.LBB0_1:
mov esi, 64
mov edi, 0
cmp rdx, 64
je .LBB0_3
lea rsi, [rdx + 1]
mov qword ptr [rsp + 784], rsi
mov r8d, dword ptr [rsp + 4*rdx + 528]
mov edi, 1
lea edx, [r8 + 2*r8]
lea r8d, [r8 + 4*rdx]
add r8d, 7
.LBB0_3:
test edi, edi
je .LBB0_11
mov dword ptr [rsp + 4*rcx + 272], r8d
cmp rsi, 64
jne .LBB0_6
xor r8d, r8d
mov edx, 64
test r8d, r8d
jne .LBB0_8
jmp .LBB0_11
.LBB0_6:
lea rdx, [rsi + 1]
mov qword ptr [rsp + 784], rdx
mov edi, dword ptr [rsp + 4*rsi + 528]
mov r8d, 1
lea esi, [rdi + 2*rdi]
lea edi, [rdi + 4*rsi]
add edi, 7
test r8d, r8d
je .LBB0_11
.LBB0_8:
mov dword ptr [rsp + 4*rcx + 276], edi
add rcx, 2
cmp rcx, 64
jne .LBB0_1
```
whereas with this PR it's unrolled and vectorized
```nasm
vpmulld ymm1, ymm0, ymmword ptr [rsp + 64]
vpaddd ymm1, ymm1, ymm2
vmovdqu ymmword ptr [rsp + 328], ymm1
vpmulld ymm1, ymm0, ymmword ptr [rsp + 96]
vpaddd ymm1, ymm1, ymm2
vmovdqu ymmword ptr [rsp + 360], ymm1
```
(though sadly still stack-to-stack)
This commit is contained in:
commit
2d91939bb7
16 changed files with 404 additions and 151 deletions
49
tests/codegen/array-map.rs
Normal file
49
tests/codegen/array-map.rs
Normal file
|
@ -0,0 +1,49 @@
|
|||
// compile-flags: -C opt-level=3 -C target-cpu=x86-64-v3
|
||||
// no-system-llvm
|
||||
// only-x86_64
|
||||
// ignore-debug (the extra assertions get in the way)
|
||||
|
||||
#![crate_type = "lib"]
|
||||
#![feature(array_zip)]
|
||||
|
||||
// CHECK-LABEL: @short_integer_map
|
||||
#[no_mangle]
|
||||
pub fn short_integer_map(x: [u32; 8]) -> [u32; 8] {
|
||||
// CHECK: load <8 x i32>
|
||||
// CHECK: shl <8 x i32>
|
||||
// CHECK: or <8 x i32>
|
||||
// CHECK: store <8 x i32>
|
||||
x.map(|x| 2 * x + 1)
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @short_integer_zip_map
|
||||
#[no_mangle]
|
||||
pub fn short_integer_zip_map(x: [u32; 8], y: [u32; 8]) -> [u32; 8] {
|
||||
// CHECK: %[[A:.+]] = load <8 x i32>
|
||||
// CHECK: %[[B:.+]] = load <8 x i32>
|
||||
// CHECK: sub <8 x i32> %[[A]], %[[B]]
|
||||
// CHECK: store <8 x i32>
|
||||
x.zip(y).map(|(x, y)| x - y)
|
||||
}
|
||||
|
||||
// This test is checking that LLVM can SRoA away a bunch of the overhead,
|
||||
// like fully moving the iterators to registers. Notably, previous implementations
|
||||
// of `map` ended up `alloca`ing the whole `array::IntoIterator`, meaning both a
|
||||
// hard-to-eliminate `memcpy` and that the iteration counts needed to be written
|
||||
// out to stack every iteration, even for infallible operations on `Copy` types.
|
||||
//
|
||||
// This is still imperfect, as there's more copies than would be ideal,
|
||||
// but hopefully work like #103830 will improve that in future,
|
||||
// and update this test to be stricter.
|
||||
//
|
||||
// CHECK-LABEL: @long_integer_map
|
||||
#[no_mangle]
|
||||
pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] {
|
||||
// CHECK: start:
|
||||
// CHECK-NEXT: alloca [64 x i32]
|
||||
// CHECK-NEXT: alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>"
|
||||
// CHECK-NOT: alloca
|
||||
// CHECK: mul <{{[0-9]+}} x i32>
|
||||
// CHECK: add <{{[0-9]+}} x i32>
|
||||
x.map(|x| 13 * x + 7)
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
// compile-flags: -C opt-level=3
|
||||
// compile-flags: -C opt-level=3 -Z merge-functions=disabled
|
||||
// only-x86_64
|
||||
#![crate_type = "lib"]
|
||||
#![feature(array_zip)]
|
||||
|
||||
// CHECK-LABEL: @auto_vectorize_direct
|
||||
#[no_mangle]
|
||||
|
@ -30,3 +31,13 @@ pub fn auto_vectorize_loop(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
|
|||
}
|
||||
c
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @auto_vectorize_array_zip_map
|
||||
#[no_mangle]
|
||||
pub fn auto_vectorize_array_zip_map(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
|
||||
// CHECK: load <4 x float>
|
||||
// CHECK: load <4 x float>
|
||||
// CHECK: fadd <4 x float>
|
||||
// CHECK: store <4 x float>
|
||||
a.zip(b).map(|(a, b)| a + b)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue