Auto merge of #107634 - scottmcm:array-drain, r=thomcc

Improve the `array::map` codegen The `map` method on arrays [is documented as sometimes performing poorly](https://doc.rust-lang.org/std/primitive.array.html#note-on-performance-and-stack-usage), and after [a question on URLO](https://users.rust-lang.org/t/try-trait-residual-o-trait-and-try-collect-into-array/88510?u=scottmcm) prompted me to take another look at the core [`try_collect_into_array`](7c46fb2111/library/core/src/array/mod.rs (L865-L912)) function, I had some ideas that ended up working better than I'd expected. There's three main ideas in here, split over three commits: 1. Don't use `array::IntoIter` when we can avoid it, since that seems to not get SRoA'd, meaning that every step writes things like loop counters into the stack unnecessarily 2. Don't return arrays in `Result`s unnecessarily, as that doesn't seem to optimize away even with `unwrap_unchecked` (perhaps because it needs to get moved into a new LLVM type to account for the discriminant) 3. Don't distract LLVM with all the `Option` dances when we know for sure we have enough items (like in `map` and `zip`). This one's a larger commit as to do it I ended up adding a new `pub(crate)` trait, but hopefully those changes are still straight-forward. (No libs-api changes; everything should be completely implementation-detail-internal.) It's still not completely fixed -- I think it needs pcwalton's `memcpy` optimizations still (#103830) to get further -- but this seems to go much better than before. And the remaining `memcpy`s are just `transmute`-equivalent (`[T; N] -> ManuallyDrop<[T; N]>` and `[MaybeUninit<T>; N] -> [T; N]`), so hopefully those will be easier to remove with LLVM16 than the previous subobject copies 🤞 r? `@thomcc` As a simple example, this test ```rust pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] { x.map(|x| 13 * x + 7) } ``` On nightly <https://rust.godbolt.org/z/xK7548TGj> takes `sub rsp, 808` ```llvm start: %array.i.i.i.i = alloca [64 x i32], align 4 %_3.sroa.5.i.i.i = alloca [65 x i32], align 4 %_5.i = alloca %"core::iter::adapters::map::Map<core::array::iter::IntoIter<u32, 64>, [closure@/app/example.rs:2:11: 2:14]>", align 8 ``` (and yes, that's a 6**5**-element array `alloca` despite 6**4**-element input and output) But with this PR it's only `sub rsp, 520` ```llvm start: %array.i.i.i.i.i.i = alloca [64 x i32], align 4 %array1.i.i.i = alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>", align 4 ``` Similarly, the loop it emits on nightly is scalar-only and horrifying ```nasm .LBB0_1: mov esi, 64 mov edi, 0 cmp rdx, 64 je .LBB0_3 lea rsi, [rdx + 1] mov qword ptr [rsp + 784], rsi mov r8d, dword ptr [rsp + 4*rdx + 528] mov edi, 1 lea edx, [r8 + 2*r8] lea r8d, [r8 + 4*rdx] add r8d, 7 .LBB0_3: test edi, edi je .LBB0_11 mov dword ptr [rsp + 4*rcx + 272], r8d cmp rsi, 64 jne .LBB0_6 xor r8d, r8d mov edx, 64 test r8d, r8d jne .LBB0_8 jmp .LBB0_11 .LBB0_6: lea rdx, [rsi + 1] mov qword ptr [rsp + 784], rdx mov edi, dword ptr [rsp + 4*rsi + 528] mov r8d, 1 lea esi, [rdi + 2*rdi] lea edi, [rdi + 4*rsi] add edi, 7 test r8d, r8d je .LBB0_11 .LBB0_8: mov dword ptr [rsp + 4*rcx + 276], edi add rcx, 2 cmp rcx, 64 jne .LBB0_1 ``` whereas with this PR it's unrolled and vectorized ```nasm vpmulld ymm1, ymm0, ymmword ptr [rsp + 64] vpaddd ymm1, ymm1, ymm2 vmovdqu ymmword ptr [rsp + 328], ymm1 vpmulld ymm1, ymm0, ymmword ptr [rsp + 96] vpaddd ymm1, ymm1, ymm2 vmovdqu ymmword ptr [rsp + 360], ymm1 ``` (though sadly still stack-to-stack)
2023-02-13 10:18:48 +00:00 · 2023-02-13 10:18:48 +00:00 · 2d91939bb7
commit 2d91939bb7
parent 20081880ad bb77860d9c
16 changed files with 404 additions and 151 deletions
--- a/tests/codegen/array-map.rs
+++ b/tests/codegen/array-map.rs
@ -0,0 +1,49 @@
+// compile-flags: -C opt-level=3 -C target-cpu=x86-64-v3
+// no-system-llvm
+// only-x86_64
+// ignore-debug (the extra assertions get in the way)
+
+#![crate_type = "lib"]
+#![feature(array_zip)]
+
+// CHECK-LABEL: @short_integer_map
+#[no_mangle]
+pub fn short_integer_map(x: [u32; 8]) -> [u32; 8] {
+    // CHECK: load <8 x i32>
+    // CHECK: shl <8 x i32>
+    // CHECK: or <8 x i32>
+    // CHECK: store <8 x i32>
+    x.map(|x| 2 * x + 1)
+}
+
+// CHECK-LABEL: @short_integer_zip_map
+#[no_mangle]
+pub fn short_integer_zip_map(x: [u32; 8], y: [u32; 8]) -> [u32; 8] {
+    // CHECK: %[[A:.+]] = load <8 x i32>
+    // CHECK: %[[B:.+]] = load <8 x i32>
+    // CHECK: sub <8 x i32> %[[A]], %[[B]]
+    // CHECK: store <8 x i32>
+    x.zip(y).map(|(x, y)| x - y)
+}
+
+// This test is checking that LLVM can SRoA away a bunch of the overhead,
+// like fully moving the iterators to registers.  Notably, previous implementations
+// of `map` ended up `alloca`ing the whole `array::IntoIterator`, meaning both a
+// hard-to-eliminate `memcpy` and that the iteration counts needed to be written
+// out to stack every iteration, even for infallible operations on `Copy` types.
+//
+// This is still imperfect, as there's more copies than would be ideal,
+// but hopefully work like #103830 will improve that in future,
+// and update this test to be stricter.
+//
+// CHECK-LABEL: @long_integer_map
+#[no_mangle]
+pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] {
+    // CHECK: start:
+    // CHECK-NEXT: alloca [64 x i32]
+    // CHECK-NEXT: alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>"
+    // CHECK-NOT: alloca
+    // CHECK: mul <{{[0-9]+}} x i32>
+    // CHECK: add <{{[0-9]+}} x i32>
+    x.map(|x| 13 * x + 7)
+}
--- a/tests/codegen/autovectorize-f32x4.rs
+++ b/tests/codegen/autovectorize-f32x4.rs
@ -1,6 +1,7 @@
-// compile-flags: -C opt-level=3
+// compile-flags: -C opt-level=3 -Z merge-functions=disabled
 // only-x86_64
 #![crate_type = "lib"]
+#![feature(array_zip)]

 // CHECK-LABEL: @auto_vectorize_direct
 #[no_mangle]
@ -30,3 +31,13 @@ pub fn auto_vectorize_loop(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    }
    c
 }
+
+// CHECK-LABEL: @auto_vectorize_array_zip_map
+#[no_mangle]
+pub fn auto_vectorize_array_zip_map(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+// CHECK: load <4 x float>
+// CHECK: load <4 x float>
+// CHECK: fadd <4 x float>
+// CHECK: store <4 x float>
+    a.zip(b).map(|(a, b)| a + b)
+}