Auto merge of #106343 - the8472:slice-iter-fold, r=scottmcm

optimize slice::Iter::fold Fixes 2 of 4 cases from #106288 ``` OLD: test slice::fold_to_last ... bench: 248 ns/iter (+/- 3) NEW: test slice::fold_to_last ... bench: 0 ns/iter (+/- 0) ```
2023-06-15 09:38:53 +00:00 · 2023-06-15 09:38:53 +00:00 · 4996b56ba9
commit 4996b56ba9
parent 5a65be8152 d90508f761
4 changed files with 56 additions and 8 deletions
--- a/library/core/src/slice/iter/macros.rs
+++ b/library/core/src/slice/iter/macros.rs
@ -191,6 +191,39 @@ macro_rules! iterator {
                self.next_back()
            }

+            #[inline]
+            fn fold<B, F>(self, init: B, mut f: F) -> B
+                where
+                    F: FnMut(B, Self::Item) -> B,
+            {
+                // this implementation consists of the following optimizations compared to the
+                // default implementation:
+                // - do-while loop, as is llvm's preferred loop shape,
+                //   see https://releases.llvm.org/16.0.0/docs/LoopTerminology.html#more-canonical-loops
+                // - bumps an index instead of a pointer since the latter case inhibits
+                //   some optimizations, see #111603
+                // - avoids Option wrapping/matching
+                if is_empty!(self) {
+                    return init;
+                }
+                let mut acc = init;
+                let mut i = 0;
+                let len = len!(self);
+                loop {
+                    // SAFETY: the loop iterates `i in 0..len`, which always is in bounds of
+                    // the slice allocation
+                    acc = f(acc, unsafe { & $( $mut_ )? *self.ptr.add(i).as_ptr() });
+                    // SAFETY: `i` can't overflow since it'll only reach usize::MAX if the
+                    // slice had that length, in which case we'll break out of the loop
+                    // after the increment
+                    i = unsafe { i.unchecked_add(1) };
+                    if i == len {
+                        break;
+                    }
+                }
+                acc
+            }
+
            // We override the default implementation, which uses `try_fold`,
            // because this simple implementation generates less LLVM IR and is
            // faster to compile.