1
Fork 0

Be more accurate about calculating display_col from a BytePos

No longer track "zero-width" chars in `SourceMap`, read directly from the line when calculating the `display_col` of a `BytePos`. Move `char_width` to `rustc_span` and use it from the emitter.

This change allows the following to properly align in terminals (depending on the font, the replaced control codepoints are rendered as 1 or 2 width, on my terminal they are rendered as 1, on VSCode text they are rendered as 2):

```
error: this file contains an unclosed delimiter
  --> $DIR/issue-68629.rs:5:17
   |
LL | ␜␟ts␀![{i
   |       -- unclosed delimiter
   |       |
   |       unclosed delimiter
LL | ␀␀  fn rݻoa>rݻm
   |                ^
```
This commit is contained in:
Esteban Küber 2024-07-09 17:00:19 +00:00
parent 89f273f40d
commit 2d7795dfb9
60 changed files with 134 additions and 278 deletions

View file

@ -1345,68 +1345,6 @@ pub struct MultiByteChar {
pub bytes: u8,
}
/// Identifies an offset of a non-narrow character in a `SourceFile`.
#[derive(Copy, Clone, Encodable, Decodable, Eq, PartialEq, Debug, HashStable_Generic)]
pub enum NonNarrowChar {
/// Represents a zero-width character.
ZeroWidth(RelativeBytePos),
/// Represents a wide (full-width) character.
Wide(RelativeBytePos),
/// Represents a tab character, represented visually with a width of 4 characters.
Tab(RelativeBytePos),
}
impl NonNarrowChar {
fn new(pos: RelativeBytePos, width: usize) -> Self {
match width {
0 => NonNarrowChar::ZeroWidth(pos),
2 => NonNarrowChar::Wide(pos),
4 => NonNarrowChar::Tab(pos),
_ => panic!("width {width} given for non-narrow character"),
}
}
/// Returns the relative offset of the character in the `SourceFile`.
pub fn pos(&self) -> RelativeBytePos {
match *self {
NonNarrowChar::ZeroWidth(p) | NonNarrowChar::Wide(p) | NonNarrowChar::Tab(p) => p,
}
}
/// Returns the width of the character, 0 (zero-width) or 2 (wide).
pub fn width(&self) -> usize {
match *self {
NonNarrowChar::ZeroWidth(_) => 0,
NonNarrowChar::Wide(_) => 2,
NonNarrowChar::Tab(_) => 4,
}
}
}
impl Add<RelativeBytePos> for NonNarrowChar {
type Output = Self;
fn add(self, rhs: RelativeBytePos) -> Self {
match self {
NonNarrowChar::ZeroWidth(pos) => NonNarrowChar::ZeroWidth(pos + rhs),
NonNarrowChar::Wide(pos) => NonNarrowChar::Wide(pos + rhs),
NonNarrowChar::Tab(pos) => NonNarrowChar::Tab(pos + rhs),
}
}
}
impl Sub<RelativeBytePos> for NonNarrowChar {
type Output = Self;
fn sub(self, rhs: RelativeBytePos) -> Self {
match self {
NonNarrowChar::ZeroWidth(pos) => NonNarrowChar::ZeroWidth(pos - rhs),
NonNarrowChar::Wide(pos) => NonNarrowChar::Wide(pos - rhs),
NonNarrowChar::Tab(pos) => NonNarrowChar::Tab(pos - rhs),
}
}
}
/// Identifies an offset of a character that was normalized away from `SourceFile`.
#[derive(Copy, Clone, Encodable, Decodable, Eq, PartialEq, Debug, HashStable_Generic)]
pub struct NormalizedPos {
@ -1581,8 +1519,6 @@ pub struct SourceFile {
pub lines: FreezeLock<SourceFileLines>,
/// Locations of multi-byte characters in the source code.
pub multibyte_chars: Vec<MultiByteChar>,
/// Width of characters that are not narrow in the source code.
pub non_narrow_chars: Vec<NonNarrowChar>,
/// Locations of characters removed during normalization.
pub normalized_pos: Vec<NormalizedPos>,
/// A hash of the filename & crate-id, used for uniquely identifying source
@ -1604,7 +1540,6 @@ impl Clone for SourceFile {
source_len: self.source_len,
lines: self.lines.clone(),
multibyte_chars: self.multibyte_chars.clone(),
non_narrow_chars: self.non_narrow_chars.clone(),
normalized_pos: self.normalized_pos.clone(),
stable_id: self.stable_id,
cnum: self.cnum,
@ -1679,7 +1614,6 @@ impl<S: SpanEncoder> Encodable<S> for SourceFile {
}
self.multibyte_chars.encode(s);
self.non_narrow_chars.encode(s);
self.stable_id.encode(s);
self.normalized_pos.encode(s);
self.cnum.encode(s);
@ -1706,7 +1640,6 @@ impl<D: SpanDecoder> Decodable<D> for SourceFile {
}
};
let multibyte_chars: Vec<MultiByteChar> = Decodable::decode(d);
let non_narrow_chars: Vec<NonNarrowChar> = Decodable::decode(d);
let stable_id = Decodable::decode(d);
let normalized_pos: Vec<NormalizedPos> = Decodable::decode(d);
let cnum: CrateNum = Decodable::decode(d);
@ -1721,7 +1654,6 @@ impl<D: SpanDecoder> Decodable<D> for SourceFile {
external_src: FreezeLock::frozen(ExternalSource::Unneeded),
lines: FreezeLock::new(lines),
multibyte_chars,
non_narrow_chars,
normalized_pos,
stable_id,
cnum,
@ -1809,8 +1741,7 @@ impl SourceFile {
let source_len = src.len();
let source_len = u32::try_from(source_len).map_err(|_| OffsetOverflowError)?;
let (lines, multibyte_chars, non_narrow_chars) =
analyze_source_file::analyze_source_file(&src);
let (lines, multibyte_chars) = analyze_source_file::analyze_source_file(&src);
Ok(SourceFile {
name,
@ -1821,7 +1752,6 @@ impl SourceFile {
source_len: RelativeBytePos::from_u32(source_len),
lines: FreezeLock::frozen(SourceFileLines::Lines(lines)),
multibyte_chars,
non_narrow_chars,
normalized_pos,
stable_id,
cnum: LOCAL_CRATE,
@ -2130,41 +2060,44 @@ impl SourceFile {
let pos = self.relative_position(pos);
let (line, col_or_chpos) = self.lookup_file_pos(pos);
if line > 0 {
let col = col_or_chpos;
let linebpos = self.lines()[line - 1];
let col_display = {
let start_width_idx = self
.non_narrow_chars
.binary_search_by_key(&linebpos, |x| x.pos())
.unwrap_or_else(|x| x);
let end_width_idx = self
.non_narrow_chars
.binary_search_by_key(&pos, |x| x.pos())
.unwrap_or_else(|x| x);
let special_chars = end_width_idx - start_width_idx;
let non_narrow: usize = self.non_narrow_chars[start_width_idx..end_width_idx]
.iter()
.map(|x| x.width())
.sum();
col.0 - special_chars + non_narrow
let Some(code) = self.get_line(line - 1) else {
// If we don't have the code available, it is ok as a fallback to return the bytepos
// instead of the "display" column, which is only used to properly show underlines
// in the terminal.
// FIXME: we'll want better handling of this in the future for the sake of tools
// that want to use the display col instead of byte offsets to modify Rust code, but
// that is a problem for another day, the previous code was already incorrect for
// both displaying *and* third party tools using the json output naïvely.
tracing::info!("couldn't find line {line} {:?}", self.name);
return (line, col_or_chpos, col_or_chpos.0);
};
(line, col, col_display)
let display_col = code.chars().take(col_or_chpos.0).map(|ch| char_width(ch)).sum();
(line, col_or_chpos, display_col)
} else {
let chpos = col_or_chpos;
let col_display = {
let end_width_idx = self
.non_narrow_chars
.binary_search_by_key(&pos, |x| x.pos())
.unwrap_or_else(|x| x);
let non_narrow: usize =
self.non_narrow_chars[0..end_width_idx].iter().map(|x| x.width()).sum();
chpos.0 - end_width_idx + non_narrow
};
(0, chpos, col_display)
// This is never meant to happen?
(0, col_or_chpos, col_or_chpos.0)
}
}
}
pub fn char_width(ch: char) -> usize {
// FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char` is. For now,
// just accept that sometimes the code line will be longer than desired.
match ch {
'\t' => 4,
// Keep the following list in sync with `rustc_errors::emitter::OUTPUT_REPLACEMENTS`. These
// are control points that we replace before printing with a visible codepoint for the sake
// of being able to point at them with underlines.
'\u{0000}' | '\u{0001}' | '\u{0002}' | '\u{0003}' | '\u{0004}' | '\u{0005}'
| '\u{0006}' | '\u{0007}' | '\u{0008}' | '\u{000B}' | '\u{000C}' | '\u{000D}'
| '\u{000E}' | '\u{000F}' | '\u{0010}' | '\u{0011}' | '\u{0012}' | '\u{0013}'
| '\u{0014}' | '\u{0015}' | '\u{0016}' | '\u{0017}' | '\u{0018}' | '\u{0019}'
| '\u{001A}' | '\u{001B}' | '\u{001C}' | '\u{001D}' | '\u{001E}' | '\u{001F}'
| '\u{007F}' => 1,
_ => unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1),
}
}
/// Normalizes the source code and records the normalizations.
fn normalize_src(src: &mut String) -> Vec<NormalizedPos> {
let mut normalized_pos = vec![];