1
Fork 0

Lazify SourceFile::lines.

`SourceFile::lines` is a big part of metadata. It's stored in a compressed form
(a difference list) to save disk space. Decoding it accounts for a significant
fraction of compile time for very small crates/programs.

This commit introduces a new type `SourceFileLines` which has a `Lines`
form and a `Diffs` form. The latter is used when the metadata is first
read, and it is only decoded into the `Lines` form when line data is
actually needed. This avoids the decoding cost for many files,
especially in `std`. It's a performance win of up to 15% for tiny
crates/programs, where metadata decoding accounts for a large share of
compilation costs.

A `Lock` is needed because the methods that access lines data (which can
trigger decoding) take `&self` rather than `&mut self`. To allow for this,
`SourceFile::lines` now takes a `FnMut` that operates on the lines slice rather
than returning the lines slice.
This commit is contained in:
Nicholas Nethercote 2022-05-30 15:59:45 +10:00
parent bef2b7cd1c
commit 0b81d7cdc6
8 changed files with 213 additions and 116 deletions

View file

@ -713,7 +713,7 @@ impl<'a, 'tcx> Decodable<CacheDecoder<'a, 'tcx>> for Span {
let len = BytePos::decode(decoder);
let file_lo = decoder.file_index_to_file(file_lo_index);
let lo = file_lo.lines[line_lo - 1] + col_lo;
let lo = file_lo.lines(|lines| lines[line_lo - 1] + col_lo);
let hi = lo + len;
Span::new(lo, hi, ctxt, parent)

View file

@ -5,7 +5,7 @@ use crate::ich::StableHashingContext;
use rustc_ast as ast;
use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
use rustc_span::{BytePos, NormalizedPos, SourceFile};
use rustc_span::{BytePos, NormalizedPos, SourceFile, SourceFileLines};
use std::assert_matches::assert_matches;
use smallvec::SmallVec;
@ -60,7 +60,7 @@ impl<'ctx> rustc_ast::HashStableContext for StableHashingContext<'ctx> {
impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
fn hash_stable(&self, hcx: &mut StableHashingContext<'a>, hasher: &mut StableHasher) {
let SourceFile {
name: _, // We hash the smaller name_hash instead of this
ref name, // We hash the smaller name_hash instead of this
name_hash,
cnum,
// Do not hash the source as it is not encoded
@ -80,9 +80,16 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
src_hash.hash_stable(hcx, hasher);
// We only hash the relative position within this source_file
lines.len().hash_stable(hcx, hasher);
for &line in lines.iter() {
stable_byte_pos(line, start_pos).hash_stable(hcx, hasher);
match &*lines.borrow() {
SourceFileLines::Lines { lines } => {
lines.len().hash_stable(hcx, hasher);
for &line in lines.iter() {
stable_byte_pos(line, start_pos).hash_stable(hcx, hasher);
}
}
SourceFileLines::Diffs { .. } => {
panic!("called hash_stable on SourceFileLines::Diffs for {:?}", name);
}
}
// We only hash the relative position within this source_file

View file

@ -1222,6 +1222,42 @@ impl DebuggerVisualizerFile {
}
}
/// The line-start positions of a [`SourceFile`], either fully decoded or
/// still in the compressed difference-list form read from metadata.
#[derive(Clone)]
pub enum SourceFileLines {
    /// The source file lines, in decoded (random-access) form.
    Lines { lines: Vec<BytePos> },

    /// The source file lines in difference list form. This matches the form
    /// used within metadata, which saves space by exploiting the fact that the
    /// lines list is sorted and individual lines are usually not that long.
    ///
    /// We read it directly from metadata and only decode it into `Lines` form
    /// when necessary. This is a significant performance win, especially for
    /// small crates where very little of `std`'s metadata is used.
    Diffs {
        /// Position of the first line. Note that this is always encoded as a
        /// `BytePos` because it is often much larger than any of the
        /// differences.
        line_start: BytePos,

        /// Always 1, 2, or 4. Always as small as possible, while being big
        /// enough to hold the length of the longest line in the source file.
        /// The 1 case is by far the most common.
        bytes_per_diff: usize,

        /// The number of diffs encoded in `raw_diffs`. Always one less than
        /// the number of lines in the source file.
        num_diffs: usize,

        /// The diffs in "raw" form. Each segment of `bytes_per_diff` length
        /// encodes one little-endian diff. Note that they aren't LEB128
        /// encoded. This makes for much faster decoding. Besides, the
        /// bytes_per_diff==1 case is by far the most common, and LEB128
        /// encoding has no effect on that case.
        raw_diffs: Vec<u8>,
    },
}
/// A single source in the [`SourceMap`].
#[derive(Clone)]
pub struct SourceFile {
@ -1241,7 +1277,7 @@ pub struct SourceFile {
/// The end position of this source in the `SourceMap`.
pub end_pos: BytePos,
/// Locations of lines beginnings in the source code.
pub lines: Vec<BytePos>,
pub lines: Lock<SourceFileLines>,
/// Locations of multi-byte characters in the source code.
pub multibyte_chars: Vec<MultiByteChar>,
/// Width of characters that are not narrow in the source code.
@ -1262,64 +1298,64 @@ impl<S: Encoder> Encodable<S> for SourceFile {
s.emit_struct_field("start_pos", false, |s| self.start_pos.encode(s))?;
s.emit_struct_field("end_pos", false, |s| self.end_pos.encode(s))?;
s.emit_struct_field("lines", false, |s| {
let lines = &self.lines[..];
// Store the length.
s.emit_u32(lines.len() as u32)?;
self.lines(|lines| {
// Store the length.
s.emit_u32(lines.len() as u32)?;
if !lines.is_empty() {
// In order to preserve some space, we exploit the fact that
// the lines list is sorted and individual lines are
// probably not that long. Because of that we can store lines
// as a difference list, using as little space as possible
// for the differences. But note that the first line is
// always encoded as a `BytePos` because its position is
// often much larger than any of the differences.
let max_line_length = if lines.len() == 1 {
0
} else {
lines
.array_windows()
.map(|&[fst, snd]| snd - fst)
.map(|bp| bp.to_usize())
.max()
.unwrap()
};
// Compute and store the difference list.
if lines.len() != 0 {
let max_line_length = if lines.len() == 1 {
0
} else {
lines
.array_windows()
.map(|&[fst, snd]| snd - fst)
.map(|bp| bp.to_usize())
.max()
.unwrap()
};
let bytes_per_diff: u8 = match max_line_length {
0..=0xFF => 1,
0x100..=0xFFFF => 2,
_ => 4,
};
let bytes_per_diff: usize = match max_line_length {
0..=0xFF => 1,
0x100..=0xFFFF => 2,
_ => 4,
};
// Encode the number of bytes used per diff.
bytes_per_diff.encode(s)?;
// Encode the number of bytes used per diff.
s.emit_u8(bytes_per_diff as u8)?;
// Encode the first element.
lines[0].encode(s)?;
// Encode the first element.
lines[0].encode(s)?;
let diff_iter = lines.array_windows().map(|&[fst, snd]| snd - fst);
match bytes_per_diff {
1 => {
for diff in diff_iter {
(diff.0 as u8).encode(s)?
// Encode the difference list.
let diff_iter = lines.array_windows().map(|&[fst, snd]| snd - fst);
let num_diffs = lines.len() - 1;
let mut raw_diffs;
match bytes_per_diff {
1 => {
raw_diffs = Vec::with_capacity(num_diffs);
for diff in diff_iter {
raw_diffs.push(diff.0 as u8);
}
}
}
2 => {
for diff in diff_iter {
(diff.0 as u16).encode(s)?
2 => {
raw_diffs = Vec::with_capacity(bytes_per_diff * num_diffs);
for diff in diff_iter {
raw_diffs.extend_from_slice(&(diff.0 as u16).to_le_bytes());
}
}
}
4 => {
for diff in diff_iter {
diff.0.encode(s)?
4 => {
raw_diffs = Vec::with_capacity(bytes_per_diff * num_diffs);
for diff in diff_iter {
raw_diffs.extend_from_slice(&(diff.0 as u32).to_le_bytes());
}
}
_ => unreachable!(),
}
_ => unreachable!(),
s.emit_raw_bytes(&raw_diffs)?;
}
}
Ok(())
Ok(())
})
})?;
s.emit_struct_field("multibyte_chars", false, |s| self.multibyte_chars.encode(s))?;
s.emit_struct_field("non_narrow_chars", false, |s| self.non_narrow_chars.encode(s))?;
@ -1336,36 +1372,22 @@ impl<D: Decoder> Decodable<D> for SourceFile {
let src_hash: SourceFileHash = Decodable::decode(d);
let start_pos: BytePos = Decodable::decode(d);
let end_pos: BytePos = Decodable::decode(d);
let lines: Vec<BytePos> = {
let lines = {
let num_lines: u32 = Decodable::decode(d);
let mut lines = Vec::with_capacity(num_lines as usize);
if num_lines > 0 {
// Read the number of bytes used per diff.
let bytes_per_diff: u8 = Decodable::decode(d);
let bytes_per_diff = d.read_u8() as usize;
// Read the first element.
let mut line_start: BytePos = Decodable::decode(d);
lines.push(line_start);
let line_start: BytePos = Decodable::decode(d);
match bytes_per_diff {
1 => lines.extend((1..num_lines).map(|_| {
line_start = line_start + BytePos(d.read_u8() as u32);
line_start
})),
2 => lines.extend((1..num_lines).map(|_| {
line_start = line_start + BytePos(d.read_u16() as u32);
line_start
})),
4 => lines.extend((1..num_lines).map(|_| {
line_start = line_start + BytePos(d.read_u32());
line_start
})),
_ => unreachable!(),
}
// Read the difference list.
let num_diffs = num_lines as usize - 1;
let raw_diffs = d.read_raw_bytes(bytes_per_diff * num_diffs).to_vec();
SourceFileLines::Diffs { line_start, bytes_per_diff, num_diffs, raw_diffs }
} else {
SourceFileLines::Lines { lines: vec![] }
}
lines
};
let multibyte_chars: Vec<MultiByteChar> = Decodable::decode(d);
let non_narrow_chars: Vec<NonNarrowChar> = Decodable::decode(d);
@ -1381,7 +1403,7 @@ impl<D: Decoder> Decodable<D> for SourceFile {
// Unused - the metadata decoder will construct
// a new SourceFile, filling in `external_src` properly
external_src: Lock::new(ExternalSource::Unneeded),
lines,
lines: Lock::new(lines),
multibyte_chars,
non_narrow_chars,
normalized_pos,
@ -1426,7 +1448,7 @@ impl SourceFile {
external_src: Lock::new(ExternalSource::Unneeded),
start_pos,
end_pos: Pos::from_usize(end_pos),
lines,
lines: Lock::new(SourceFileLines::Lines { lines }),
multibyte_chars,
non_narrow_chars,
normalized_pos,
@ -1435,10 +1457,63 @@ impl SourceFile {
}
}
/// Applies `f` to this file's line-start positions, decoding them from the
/// compressed `Diffs` form into the random-access `Lines` form on first use.
///
/// Takes a closure rather than returning the slice because decoding must
/// replace the `Diffs` variant with the `Lines` variant in place, and this
/// method only has `&self` — hence the `Lock` around `self.lines`.
///
/// # Panics
///
/// Panics if the stored `raw_diffs` length is inconsistent with
/// `bytes_per_diff * num_diffs` (a corrupt-metadata invariant violation).
pub fn lines<F, R>(&self, mut f: F) -> R
where
    F: FnMut(&[BytePos]) -> R,
{
    let mut guard = self.lines.borrow_mut();
    match &*guard {
        // Fast path: already decoded.
        SourceFileLines::Lines { lines } => f(lines),
        // Slow path: convert from "diffs" form to "lines" form, then cache
        // the result so subsequent calls take the fast path above.
        // `mut line_start` binds by copy (`BytePos` is `Copy`) so it can be
        // used as a running accumulator.
        SourceFileLines::Diffs { mut line_start, bytes_per_diff, num_diffs, raw_diffs } => {
            let num_lines = num_diffs + 1;
            let mut lines = Vec::with_capacity(num_lines);
            lines.push(line_start);

            // Every diff must be fully present: an exact-length check (rather
            // than a truncating division) also rejects trailing garbage bytes.
            assert_eq!(raw_diffs.len(), bytes_per_diff * num_diffs);
            match bytes_per_diff {
                1 => {
                    lines.extend(raw_diffs.iter().map(|&diff| {
                        line_start = line_start + BytePos(diff as u32);
                        line_start
                    }));
                }
                2 => {
                    // `chunks_exact` guarantees 2-byte chunks, letting the
                    // compiler elide per-element bounds checks.
                    lines.extend(raw_diffs.chunks_exact(2).map(|bytes| {
                        let diff = u16::from_le_bytes([bytes[0], bytes[1]]);
                        line_start = line_start + BytePos(diff as u32);
                        line_start
                    }));
                }
                4 => {
                    lines.extend(raw_diffs.chunks_exact(4).map(|bytes| {
                        let diff =
                            u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
                        line_start = line_start + BytePos(diff);
                        line_start
                    }));
                }
                _ => unreachable!(),
            }

            // Run the closure before overwriting the guard, then cache the
            // decoded form for all future calls.
            let res = f(&lines);
            *guard = SourceFileLines::Lines { lines };
            res
        }
    }
}
/// Returns the `BytePos` of the beginning of the current line.
pub fn line_begin_pos(&self, pos: BytePos) -> BytePos {
let line_index = self.lookup_line(pos).unwrap();
self.lines[line_index]
self.lines(|lines| lines[line_index])
}
/// Add externally loaded source.
@ -1495,8 +1570,8 @@ impl SourceFile {
}
let begin = {
let line = self.lines.get(line_number)?;
let begin: BytePos = *line - self.start_pos;
let line = self.lines(|lines| lines.get(line_number).copied())?;
let begin: BytePos = line - self.start_pos;
begin.to_usize()
};
@ -1518,7 +1593,7 @@ impl SourceFile {
}
pub fn count_lines(&self) -> usize {
self.lines.len()
self.lines(|lines| lines.len())
}
/// Finds the line containing the given position. The return value is the
@ -1526,11 +1601,11 @@ impl SourceFile {
/// number. If the source_file is empty or the position is located before the
/// first line, `None` is returned.
pub fn lookup_line(&self, pos: BytePos) -> Option<usize> {
match self.lines.binary_search(&pos) {
self.lines(|lines| match lines.binary_search(&pos) {
Ok(idx) => Some(idx),
Err(0) => None,
Err(idx) => Some(idx - 1),
}
})
}
pub fn line_bounds(&self, line_index: usize) -> Range<BytePos> {
@ -1538,12 +1613,14 @@ impl SourceFile {
return self.start_pos..self.end_pos;
}
assert!(line_index < self.lines.len());
if line_index == (self.lines.len() - 1) {
self.lines[line_index]..self.end_pos
} else {
self.lines[line_index]..self.lines[line_index + 1]
}
self.lines(|lines| {
assert!(line_index < lines.len());
if line_index == (lines.len() - 1) {
lines[line_index]..self.end_pos
} else {
lines[line_index]..lines[line_index + 1]
}
})
}
/// Returns whether or not the file contains the given `SourceMap` byte
@ -1605,7 +1682,7 @@ impl SourceFile {
match self.lookup_line(pos) {
Some(a) => {
let line = a + 1; // Line numbers start at 1
let linebpos = self.lines[a];
let linebpos = self.lines(|lines| lines[a]);
let linechpos = self.bytepos_to_file_charpos(linebpos);
let col = chpos - linechpos;
debug!("byte pos {:?} is on the line at byte pos {:?}", pos, linebpos);
@ -1624,7 +1701,7 @@ impl SourceFile {
let (line, col_or_chpos) = self.lookup_file_pos(pos);
if line > 0 {
let col = col_or_chpos;
let linebpos = self.lines[line - 1];
let linebpos = self.lines(|lines| lines[line - 1]);
let col_display = {
let start_width_idx = self
.non_narrow_chars

View file

@ -331,7 +331,7 @@ impl SourceMap {
name_hash: u128,
source_len: usize,
cnum: CrateNum,
mut file_local_lines: Vec<BytePos>,
file_local_lines: Lock<SourceFileLines>,
mut file_local_multibyte_chars: Vec<MultiByteChar>,
mut file_local_non_narrow_chars: Vec<NonNarrowChar>,
mut file_local_normalized_pos: Vec<NormalizedPos>,
@ -355,8 +355,15 @@ impl SourceMap {
// form rather than pre-computing the offset into a local variable. The
// compiler backend can optimize away the repeated computations in a
// way that won't trigger overflow checks.
for pos in &mut file_local_lines {
*pos = (*pos - original_start_pos) + start_pos;
match &mut *file_local_lines.borrow_mut() {
SourceFileLines::Lines { lines } => {
for pos in lines {
*pos = (*pos - original_start_pos) + start_pos;
}
}
SourceFileLines::Diffs { line_start, .. } => {
*line_start = (*line_start - original_start_pos) + start_pos;
}
}
for mbc in &mut file_local_multibyte_chars {
mbc.pos = (mbc.pos - original_start_pos) + start_pos;

View file

@ -5,7 +5,7 @@ fn test_lookup_line() {
let source = "abcdefghijklm\nabcdefghij\n...".to_owned();
let sf =
SourceFile::new(FileName::Anon(0), source, BytePos(3), SourceFileHashAlgorithm::Sha256);
assert_eq!(sf.lines.as_slice(), &[BytePos(3), BytePos(17), BytePos(28)]);
sf.lines(|lines| assert_eq!(lines, &[BytePos(3), BytePos(17), BytePos(28)]));
assert_eq!(sf.lookup_line(BytePos(0)), None);
assert_eq!(sf.lookup_line(BytePos(3)), Some(0));

View file

@ -187,11 +187,13 @@ fn item_has_safety_comment(cx: &LateContext<'_>, item: &hir::Item<'_>) -> bool {
&& Lrc::ptr_eq(&unsafe_line.sf, &comment_start_line.sf)
&& let Some(src) = unsafe_line.sf.src.as_deref()
{
comment_start_line.line < unsafe_line.line && text_has_safety_comment(
src,
&unsafe_line.sf.lines[comment_start_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
unsafe_line.sf.lines(|lines| {
comment_start_line.line < unsafe_line.line && text_has_safety_comment(
src,
&lines[comment_start_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
})
} else {
// Problem getting source text. Pretend a comment was found.
true
@ -249,11 +251,13 @@ fn span_from_macro_expansion_has_safety_comment(cx: &LateContext<'_>, span: Span
&& Lrc::ptr_eq(&unsafe_line.sf, &macro_line.sf)
&& let Some(src) = unsafe_line.sf.src.as_deref()
{
macro_line.line < unsafe_line.line && text_has_safety_comment(
src,
&unsafe_line.sf.lines[macro_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
unsafe_line.sf.lines(|lines| {
macro_line.line < unsafe_line.line && text_has_safety_comment(
src,
&lines[macro_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
})
} else {
// Problem getting source text. Pretend a comment was found.
true
@ -276,11 +280,13 @@ fn span_in_body_has_safety_comment(cx: &LateContext<'_>, span: Span) -> bool {
// Get the text from the start of function body to the unsafe block.
// fn foo() { some_stuff; unsafe { stuff }; other_stuff; }
// ^-------------^
body_line.line < unsafe_line.line && text_has_safety_comment(
src,
&unsafe_line.sf.lines[body_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
unsafe_line.sf.lines(|lines| {
body_line.line < unsafe_line.line && text_has_safety_comment(
src,
&lines[body_line.line + 1..=unsafe_line.line],
unsafe_line.sf.start_pos.to_usize(),
)
})
} else {
// Problem getting source text. Pretend a comment was found.
true

View file

@ -283,10 +283,10 @@ pub fn span_lint_and_sugg_for_edges(
{
let split_idx = MAX_SUGGESTION_HIGHLIGHT_LINES / 2;
let span_upper = sm.span_until_char(
sp.with_hi(line_upper.sf.lines[line_upper.line + split_idx]),
sp.with_hi(line_upper.sf.lines(|lines| lines[line_upper.line + split_idx])),
'\n',
);
let span_bottom = sp.with_lo(line_bottom.sf.lines[line_bottom.line - split_idx]);
let span_bottom = sp.with_lo(line_bottom.sf.lines(|lines| lines[line_bottom.line - split_idx]));
let sugg_lines_vec = sugg.lines().collect::<Vec<&str>>();
let sugg_upper = sugg_lines_vec[..split_idx].join("\n");

View file

@ -1149,7 +1149,7 @@ fn line_span<T: LintContext>(cx: &T, span: Span) -> Span {
let span = original_sp(span, DUMMY_SP);
let source_map_and_line = cx.sess().source_map().lookup_line(span.lo()).unwrap();
let line_no = source_map_and_line.line;
let line_start = source_map_and_line.sf.lines[line_no];
let line_start = source_map_and_line.sf.lines(|lines| lines[line_no]);
span.with_lo(line_start)
}