1
Fork 0

Lazify SourceFile::lines.

`SourceFile::lines` is a big part of metadata. It's stored in a compressed form
(a difference list) to save disk space. Decoding it accounts for a significant
fraction of compile time for very small crates/programs.

This commit introduces a new type `SourceFileLines` which has a `Lines`
form and a `Diffs` form. The latter is used when the metadata is first
read, and it is only decoded into the `Lines` form when line data is
actually needed. This avoids the decoding cost for many files,
especially in `std`. It's a performance win of up to 15% for tiny
crates/programs, where metadata decoding accounts for a large share of
compilation costs.

A `Lock` is needed because the methods that access lines data (which can
trigger decoding) take `&self` rather than `&mut self`. To allow for this,
`SourceFile::lines` now takes a `FnMut` that operates on the lines slice rather
than returning the lines slice.
This commit is contained in:
Nicholas Nethercote 2022-05-30 15:59:45 +10:00
parent bef2b7cd1c
commit 0b81d7cdc6
8 changed files with 213 additions and 116 deletions

View file

@ -713,7 +713,7 @@ impl<'a, 'tcx> Decodable<CacheDecoder<'a, 'tcx>> for Span {
let len = BytePos::decode(decoder);
let file_lo = decoder.file_index_to_file(file_lo_index);
let lo = file_lo.lines[line_lo - 1] + col_lo;
let lo = file_lo.lines(|lines| lines[line_lo - 1] + col_lo);
let hi = lo + len;
Span::new(lo, hi, ctxt, parent)

View file

@ -5,7 +5,7 @@ use crate::ich::StableHashingContext;
use rustc_ast as ast;
use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
use rustc_span::{BytePos, NormalizedPos, SourceFile};
use rustc_span::{BytePos, NormalizedPos, SourceFile, SourceFileLines};
use std::assert_matches::assert_matches;
use smallvec::SmallVec;
@ -60,7 +60,7 @@ impl<'ctx> rustc_ast::HashStableContext for StableHashingContext<'ctx> {
impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
fn hash_stable(&self, hcx: &mut StableHashingContext<'a>, hasher: &mut StableHasher) {
let SourceFile {
name: _, // We hash the smaller name_hash instead of this
ref name, // We hash the smaller name_hash instead of this
name_hash,
cnum,
// Do not hash the source as it is not encoded
@ -80,9 +80,16 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
src_hash.hash_stable(hcx, hasher);
// We only hash the relative position within this source_file
lines.len().hash_stable(hcx, hasher);
for &line in lines.iter() {
stable_byte_pos(line, start_pos).hash_stable(hcx, hasher);
match &*lines.borrow() {
SourceFileLines::Lines { lines } => {
lines.len().hash_stable(hcx, hasher);
for &line in lines.iter() {
stable_byte_pos(line, start_pos).hash_stable(hcx, hasher);
}
}
SourceFileLines::Diffs { .. } => {
panic!("called hash_stable on SourceFileLines::Diffs for {:?}", name);
}
}
// We only hash the relative position within this source_file

View file

@ -1222,6 +1222,42 @@ impl DebuggerVisualizerFile {
}
}
/// The line-start positions of a [`SourceFile`], either fully decoded or
/// still in the compressed difference-list form read from metadata.
#[derive(Clone)]
pub enum SourceFileLines {
    /// The source file lines, in decoded (random-access) form.
    Lines { lines: Vec<BytePos> },

    /// The source file lines in difference list form. This matches the form
    /// used within metadata, which saves space by exploiting the fact that the
    /// lines list is sorted and individual lines are usually not that long.
    ///
    /// We read it directly from metadata and only decode it into `Lines` form
    /// when necessary. This is a significant performance win, especially for
    /// small crates where very little of `std`'s metadata is used.
    Diffs {
        /// Position of the first line. Note that this is always encoded as a
        /// `BytePos` because it is often much larger than any of the
        /// differences.
        line_start: BytePos,

        /// Always 1, 2, or 4. Always as small as possible, while being big
        /// enough to hold the length of the longest line in the source file.
        /// The 1 case is by far the most common.
        bytes_per_diff: usize,

        /// The number of diffs encoded in `raw_diffs`. Always one less than
        /// the number of lines in the source file.
        num_diffs: usize,

        /// The diffs in "raw" form. Each segment of `bytes_per_diff` length
        /// encodes one little-endian diff. Note that they aren't LEB128
        /// encoded. This makes for much faster decoding. Besides, the
        /// bytes_per_diff==1 case is by far the most common, and LEB128
        /// encoding has no effect on that case.
        raw_diffs: Vec<u8>,
    },
}
/// A single source in the [`SourceMap`].
#[derive(Clone)]
pub struct SourceFile {
@ -1241,7 +1277,7 @@ pub struct SourceFile {
/// The end position of this source in the `SourceMap`.
pub end_pos: BytePos,
/// Locations of lines beginnings in the source code.
pub lines: Vec<BytePos>,
pub lines: Lock<SourceFileLines>,
/// Locations of multi-byte characters in the source code.
pub multibyte_chars: Vec<MultiByteChar>,
/// Width of characters that are not narrow in the source code.
@ -1262,64 +1298,64 @@ impl<S: Encoder> Encodable<S> for SourceFile {
s.emit_struct_field("start_pos", false, |s| self.start_pos.encode(s))?;
s.emit_struct_field("end_pos", false, |s| self.end_pos.encode(s))?;
s.emit_struct_field("lines", false, |s| {
let lines = &self.lines[..];
// Store the length.
s.emit_u32(lines.len() as u32)?;
self.lines(|lines| {
// Store the length.
s.emit_u32(lines.len() as u32)?;
if !lines.is_empty() {
// In order to preserve some space, we exploit the fact that
// the lines list is sorted and individual lines are
// probably not that long. Because of that we can store lines
// as a difference list, using as little space as possible
// for the differences. But note that the first line is
// always encoded as a `BytePos` because its position is
// often much larger than any of the differences.
let max_line_length = if lines.len() == 1 {
0
} else {
lines
.array_windows()
.map(|&[fst, snd]| snd - fst)
.map(|bp| bp.to_usize())
.max()
.unwrap()
};
// Compute and store the difference list.
if lines.len() != 0 {
let max_line_length = if lines.len() == 1 {
0
} else {
lines
.array_windows()
.map(|&[fst, snd]| snd - fst)
.map(|bp| bp.to_usize())
.max()
.unwrap()
};
let bytes_per_diff: u8 = match max_line_length {
0..=0xFF => 1,
0x100..=0xFFFF => 2,
_ => 4,
};
let bytes_per_diff: usize = match max_line_length {
0..=0xFF => 1,
0x100..=0xFFFF => 2,
_ => 4,
};
// Encode the number of bytes used per diff.
bytes_per_diff.encode(s)?;
// Encode the number of bytes used per diff.
s.emit_u8(bytes_per_diff as u8)?;
// Encode the first element.
lines[0].encode(s)?;
// Encode the first element.
lines[0].encode(s)?;
let diff_iter = lines.array_windows().map(|&[fst, snd]| snd - fst);
match bytes_per_diff {
1 => {
for diff in diff_iter {
(diff.0 as u8).encode(s)?
// Encode the difference list.
let diff_iter = lines.array_windows().map(|&[fst, snd]| snd - fst);
let num_diffs = lines.len() - 1;
let mut raw_diffs;
match bytes_per_diff {
1 => {
raw_diffs = Vec::with_capacity(num_diffs);
for diff in diff_iter {
raw_diffs.push(diff.0 as u8);
}
}
}
2 => {
for diff in diff_iter {
(diff.0 as u16).encode(s)?
2 => {
raw_diffs = Vec::with_capacity(bytes_per_diff * num_diffs);
for diff in diff_iter {
raw_diffs.extend_from_slice(&(diff.0 as u16).to_le_bytes());
}
}
}
4 => {
for diff in diff_iter {
diff.0.encode(s)?
4 => {
raw_diffs = Vec::with_capacity(bytes_per_diff * num_diffs);
for diff in diff_iter {
raw_diffs.extend_from_slice(&(diff.0 as u32).to_le_bytes());
}
}
_ => unreachable!(),
}
_ => unreachable!(),
s.emit_raw_bytes(&raw_diffs)?;
}
}
Ok(())
Ok(())
})
})?;
s.emit_struct_field("multibyte_chars", false, |s| self.multibyte_chars.encode(s))?;
s.emit_struct_field("non_narrow_chars", false, |s| self.non_narrow_chars.encode(s))?;
@ -1336,36 +1372,22 @@ impl<D: Decoder> Decodable<D> for SourceFile {
let src_hash: SourceFileHash = Decodable::decode(d);
let start_pos: BytePos = Decodable::decode(d);
let end_pos: BytePos = Decodable::decode(d);
let lines: Vec<BytePos> = {
let lines = {
let num_lines: u32 = Decodable::decode(d);
let mut lines = Vec::with_capacity(num_lines as usize);
if num_lines > 0 {
// Read the number of bytes used per diff.
let bytes_per_diff: u8 = Decodable::decode(d);
let bytes_per_diff = d.read_u8() as usize;
// Read the first element.
let mut line_start: BytePos = Decodable::decode(d);
lines.push(line_start);
let line_start: BytePos = Decodable::decode(d);
match bytes_per_diff {
1 => lines.extend((1..num_lines).map(|_| {
line_start = line_start + BytePos(d.read_u8() as u32);
line_start
})),
2 => lines.extend((1..num_lines).map(|_| {
line_start = line_start + BytePos(d.read_u16() as u32);
line_start
})),
4 => lines.extend((1..num_lines).map(|_| {
line_start = line_start + BytePos(d.read_u32());
line_start
})),
_ => unreachable!(),
}
// Read the difference list.
let num_diffs = num_lines as usize - 1;
let raw_diffs = d.read_raw_bytes(bytes_per_diff * num_diffs).to_vec();
SourceFileLines::Diffs { line_start, bytes_per_diff, num_diffs, raw_diffs }
} else {
SourceFileLines::Lines { lines: vec![] }
}
lines
};
let multibyte_chars: Vec<MultiByteChar> = Decodable::decode(d);
let non_narrow_chars: Vec<NonNarrowChar> = Decodable::decode(d);
@ -1381,7 +1403,7 @@ impl<D: Decoder> Decodable<D> for SourceFile {
// Unused - the metadata decoder will construct
// a new SourceFile, filling in `external_src` properly
external_src: Lock::new(ExternalSource::Unneeded),
lines,
lines: Lock::new(lines),
multibyte_chars,
non_narrow_chars,
normalized_pos,
@ -1426,7 +1448,7 @@ impl SourceFile {
external_src: Lock::new(ExternalSource::Unneeded),
start_pos,
end_pos: Pos::from_usize(end_pos),
lines,
lines: Lock::new(SourceFileLines::Lines { lines }),
multibyte_chars,
non_narrow_chars,
normalized_pos,
@ -1435,10 +1457,63 @@ impl SourceFile {
}
}
/// Applies `f` to this file's line-start positions, decoding them from the
/// compressed `Diffs` form into the random-access `Lines` form on first use.
///
/// Takes a closure rather than returning the slice because decoding must
/// replace the `Diffs` variant with the `Lines` variant in place, and this
/// method only has `&self` — hence the `Lock` around `self.lines`.
///
/// # Panics
///
/// Panics if the stored `raw_diffs` length is inconsistent with
/// `bytes_per_diff * num_diffs` (a corrupt-metadata invariant violation).
pub fn lines<F, R>(&self, mut f: F) -> R
where
    F: FnMut(&[BytePos]) -> R,
{
    let mut guard = self.lines.borrow_mut();
    match &*guard {
        // Fast path: already decoded.
        SourceFileLines::Lines { lines } => f(lines),
        // Slow path: convert from "diffs" form to "lines" form, then cache
        // the result so subsequent calls take the fast path above.
        // `mut line_start` binds by copy (`BytePos` is `Copy`) so it can be
        // used as a running accumulator.
        SourceFileLines::Diffs { mut line_start, bytes_per_diff, num_diffs, raw_diffs } => {
            let num_lines = num_diffs + 1;
            let mut lines = Vec::with_capacity(num_lines);
            lines.push(line_start);

            // Every diff must be fully present: an exact-length check (rather
            // than a truncating division) also rejects trailing garbage bytes.
            assert_eq!(raw_diffs.len(), bytes_per_diff * num_diffs);
            match bytes_per_diff {
                1 => {
                    lines.extend(raw_diffs.iter().map(|&diff| {
                        line_start = line_start + BytePos(diff as u32);
                        line_start
                    }));
                }
                2 => {
                    // `chunks_exact` guarantees 2-byte chunks, letting the
                    // compiler elide per-element bounds checks.
                    lines.extend(raw_diffs.chunks_exact(2).map(|bytes| {
                        let diff = u16::from_le_bytes([bytes[0], bytes[1]]);
                        line_start = line_start + BytePos(diff as u32);
                        line_start
                    }));
                }
                4 => {
                    lines.extend(raw_diffs.chunks_exact(4).map(|bytes| {
                        let diff =
                            u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
                        line_start = line_start + BytePos(diff);
                        line_start
                    }));
                }
                _ => unreachable!(),
            }

            // Run the closure before overwriting the guard, then cache the
            // decoded form for all future calls.
            let res = f(&lines);
            *guard = SourceFileLines::Lines { lines };
            res
        }
    }
}
/// Returns the `BytePos` of the beginning of the current line.
pub fn line_begin_pos(&self, pos: BytePos) -> BytePos {
let line_index = self.lookup_line(pos).unwrap();
self.lines[line_index]
self.lines(|lines| lines[line_index])
}
/// Add externally loaded source.
@ -1495,8 +1570,8 @@ impl SourceFile {
}
let begin = {
let line = self.lines.get(line_number)?;
let begin: BytePos = *line - self.start_pos;
let line = self.lines(|lines| lines.get(line_number).copied())?;
let begin: BytePos = line - self.start_pos;
begin.to_usize()
};
@ -1518,7 +1593,7 @@ impl SourceFile {
}
pub fn count_lines(&self) -> usize {
self.lines.len()
self.lines(|lines| lines.len())
}
/// Finds the line containing the given position. The return value is the
@ -1526,11 +1601,11 @@ impl SourceFile {
/// number. If the source_file is empty or the position is located before the
/// first line, `None` is returned.
pub fn lookup_line(&self, pos: BytePos) -> Option<usize> {
match self.lines.binary_search(&pos) {
self.lines(|lines| match lines.binary_search(&pos) {
Ok(idx) => Some(idx),
Err(0) => None,
Err(idx) => Some(idx - 1),
}
})
}
pub fn line_bounds(&self, line_index: usize) -> Range<BytePos> {
@ -1538,12 +1613,14 @@ impl SourceFile {
return self.start_pos..self.end_pos;
}
assert!(line_index < self.lines.len());
if line_index == (self.lines.len() - 1) {
self.lines[line_index]..self.end_pos
} else {
self.lines[line_index]..self.lines[line_index + 1]
}
self.lines(|lines| {
assert!(line_index < lines.len());
if line_index == (lines.len() - 1) {
lines[line_index]..self.end_pos
} else {
lines[line_index]..lines[line_index + 1]
}
})
}
/// Returns whether or not the file contains the given `SourceMap` byte
@ -1605,7 +1682,7 @@ impl SourceFile {
match self.lookup_line(pos) {
Some(a) => {
let line = a + 1; // Line numbers start at 1
let linebpos = self.lines[a];
let linebpos = self.lines(|lines| lines[a]);
let linechpos = self.bytepos_to_file_charpos(linebpos);
let col = chpos - linechpos;
debug!("byte pos {:?} is on the line at byte pos {:?}", pos, linebpos);
@ -1624,7 +1701,7 @@ impl SourceFile {
let (line, col_or_chpos) = self.lookup_file_pos(pos);
if line > 0 {
let col = col_or_chpos;
let linebpos = self.lines[line - 1];
let linebpos = self.lines(|lines| lines[line - 1]);
let col_display = {
let start_width_idx = self
.non_narrow_chars

View file

@ -331,7 +331,7 @@ impl SourceMap {
name_hash: u128,
source_len: usize,
cnum: CrateNum,
mut file_local_lines: Vec<BytePos>,
file_local_lines: Lock<SourceFileLines>,
mut file_local_multibyte_chars: Vec<MultiByteChar>,
mut file_local_non_narrow_chars: Vec<NonNarrowChar>,
mut file_local_normalized_pos: Vec<NormalizedPos>,
@ -355,8 +355,15 @@ impl SourceMap {
// form rather than pre-computing the offset into a local variable. The
// compiler backend can optimize away the repeated computations in a
// way that won't trigger overflow checks.
for pos in &mut file_local_lines {
*pos = (*pos - original_start_pos) + start_pos;
match &mut *file_local_lines.borrow_mut() {
SourceFileLines::Lines { lines } => {
for pos in lines {
*pos = (*pos - original_start_pos) + start_pos;
}
}
SourceFileLines::Diffs { line_start, .. } => {
*line_start = (*line_start - original_start_pos) + start_pos;
}
}
for mbc in &mut file_local_multibyte_chars {
mbc.pos = (mbc.pos - original_start_pos) + start_pos;

View file

@ -5,7 +5,7 @@ fn test_lookup_line() {
let source = "abcdefghijklm\nabcdefghij\n...".to_owned();
let sf =
SourceFile::new(FileName::Anon(0), source, BytePos(3), SourceFileHashAlgorithm::Sha256);
assert_eq!(sf.lines.as_slice(), &[BytePos(3), BytePos(17), BytePos(28)]);
sf.lines(|lines| assert_eq!(lines, &[BytePos(3), BytePos(17), BytePos(28)]));
assert_eq!(sf.lookup_line(BytePos(0)), None);
assert_eq!(sf.lookup_line(BytePos(3)), Some(0));

View file

@ -187,11 +187,13 @@ fn item_has_safety_comment(cx: &LateContext<'_>, item: &hir::Item<'_>) -> bool {
&& Lrc::ptr_eq(&unsafe_line.sf, &comment_start_line.sf)
&& let Some(src) = unsafe_line.sf.src.as_deref()
{
comment_start_line.line < unsafe_line.line && text_has_safety_comment(
src,
&unsafe_line.sf.lines[comment_start_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
unsafe_line.sf.lines(|lines| {
comment_start_line.line < unsafe_line.line && text_has_safety_comment(
src,
&lines[comment_start_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
})
} else {
// Problem getting source text. Pretend a comment was found.
true
@ -249,11 +251,13 @@ fn span_from_macro_expansion_has_safety_comment(cx: &LateContext<'_>, span: Span
&& Lrc::ptr_eq(&unsafe_line.sf, &macro_line.sf)
&& let Some(src) = unsafe_line.sf.src.as_deref()
{
macro_line.line < unsafe_line.line && text_has_safety_comment(
src,
&unsafe_line.sf.lines[macro_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
unsafe_line.sf.lines(|lines| {
macro_line.line < unsafe_line.line && text_has_safety_comment(
src,
&lines[macro_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
})
} else {
// Problem getting source text. Pretend a comment was found.
true
@ -276,11 +280,13 @@ fn span_in_body_has_safety_comment(cx: &LateContext<'_>, span: Span) -> bool {
// Get the text from the start of function body to the unsafe block.
// fn foo() { some_stuff; unsafe { stuff }; other_stuff; }
// ^-------------^
body_line.line < unsafe_line.line && text_has_safety_comment(
src,
&unsafe_line.sf.lines[body_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
unsafe_line.sf.lines(|lines| {
body_line.line < unsafe_line.line && text_has_safety_comment(
src,
&lines[body_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
})
} else {
// Problem getting source text. Pretend a comment was found.
true

View file

@ -283,10 +283,10 @@ pub fn span_lint_and_sugg_for_edges(
{
let split_idx = MAX_SUGGESTION_HIGHLIGHT_LINES / 2;
let span_upper = sm.span_until_char(
sp.with_hi(line_upper.sf.lines[line_upper.line + split_idx]),
sp.with_hi(line_upper.sf.lines(|lines| lines[line_upper.line + split_idx])),
'\n',
);
let span_bottom = sp.with_lo(line_bottom.sf.lines[line_bottom.line - split_idx]);
let span_bottom = sp.with_lo(line_bottom.sf.lines(|lines| lines[line_bottom.line - split_idx]));
let sugg_lines_vec = sugg.lines().collect::<Vec<&str>>();
let sugg_upper = sugg_lines_vec[..split_idx].join("\n");

View file

@ -1149,7 +1149,7 @@ fn line_span<T: LintContext>(cx: &T, span: Span) -> Span {
let span = original_sp(span, DUMMY_SP);
let source_map_and_line = cx.sess().source_map().lookup_line(span.lo()).unwrap();
let line_no = source_map_and_line.line;
let line_start = source_map_and_line.sf.lines[line_no];
let line_start = source_map_and_line.sf.lines(|lines| lines[line_no]);
span.with_lo(line_start)
}