
Simplify raw string error reporting.

This makes `UnvalidatedRawStr` and `ValidatedRawStr` unnecessary and removes 70 lines.
Julian Wollersberger 2020-05-29 17:37:16 +02:00
parent b85e3fe010
commit 5fbbfbbfa9
3 changed files with 85 additions and 216 deletions
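
As orientation for the diff below: after this change the lexer reports raw-string problems as plain data, a hash count plus an optional `RawStrError`, instead of handing callers an `UnvalidatedRawStr` to `.validate()`. The following sketch is not code from this commit; it mirrors the new error type and uses a hypothetical `describe` helper to show how a caller branches on `(n_hashes, err)`.

// Self-contained sketch; `RawStrError` is copied from the patch below,
// `describe` and `main` are illustrative only.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RawStrError {
    /// A non-`#` character between `r` and `"`, e.g. `r#~"..`.
    InvalidStarter { bad_char: char },
    /// The raw string was never terminated.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s.
    TooManyDelimiters { found: usize },
}

// A consumer of the new shape: a plain match on `Option<RawStrError>`
// replaces the old `UnvalidatedRawStr::validate()` step.
fn describe(n_hashes: u16, err: Option<RawStrError>) -> String {
    match err {
        None => format!("raw string delimited by {} `#`s", n_hashes),
        Some(RawStrError::InvalidStarter { bad_char }) => {
            format!("expected `\"` after the `#`s, found {:?}", bad_char)
        }
        Some(RawStrError::NoTerminator { expected, found, .. }) => {
            format!("unterminated: expected {} closing `#`s, found {}", expected, found)
        }
        Some(RawStrError::TooManyDelimiters { found }) => {
            format!("{} `#`s exceeds the limit of 65535", found)
        }
    }
}

fn main() {
    // Cases mirroring the new unit tests, plus the untested overflow case.
    println!("{}", describe(0, None)); // r"abc"
    println!("{}", describe(1, Some(RawStrError::InvalidStarter { bad_char: '~' }))); // r#~"abc"#
    let unterminated =
        RawStrError::NoTerminator { expected: 2, found: 1, possible_terminator_offset: Some(7) };
    println!("{}", describe(2, Some(unterminated))); // r##"abc"#
    println!("{}", describe(0, Some(RawStrError::TooManyDelimiters { found: 70_000 })));
}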

View file

@@ -29,7 +29,7 @@ mod tests;
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
-use std::convert::TryInto;
+use std::convert::TryFrom;
 
 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@@ -142,84 +142,24 @@ pub enum LiteralKind {
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
     /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr(UnvalidatedRawStr),
+    RawStr { n_hashes: u16, err: Option<RawStrError> },
     /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr(UnvalidatedRawStr),
+    RawByteStr { n_hashes: u16, err: Option<RawStrError> },
 }
 
-/// Represents something that looks like a raw string, but may have some
-/// problems. Use `.validate()` to convert it into something
-/// usable.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub struct UnvalidatedRawStr {
-    /// The prefix (`r###"`) is valid
-    valid_start: bool,
-    /// The postfix (`"###`) is valid
-    valid_end: bool,
-    /// The number of leading `#`
-    n_start_hashes: usize,
-    /// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`
-    n_end_hashes: usize,
-    /// The offset starting at `r` or `br` where the user may have intended to end the string.
-    /// Currently, it is the longest sequence of pattern `"#+"`.
-    possible_terminator_offset: Option<usize>,
-}
-
 /// Error produced validating a raw string. Represents cases like:
-/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
-/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
-/// - Too many `#`s (>65536): `TooManyDelimiters`
+/// - `r##~"abcde"##`: `InvalidStarter`
+/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
+/// - Too many `#`s (>65535): `TooManyDelimiters`
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub enum LexRawStrError {
+pub enum RawStrError {
     /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
-    InvalidStarter,
+    InvalidStarter { bad_char: char },
     /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
     /// may have intended to terminate it.
     NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
-    /// More than 65536 `#`s exist.
-    TooManyDelimiters,
+    /// More than 65535 `#`s exist.
+    TooManyDelimiters { found: usize },
 }
-
-/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where
-/// there are a matching number of `#` characters in both. Note that this will
-/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
-/// `ValidatedRawString { n_hashes: 3 }` followed by a `#` token.
-#[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub struct ValidatedRawStr {
-    n_hashes: u16,
-}
-
-impl ValidatedRawStr {
-    pub fn num_hashes(&self) -> u16 {
-        self.n_hashes
-    }
-}
-
-impl UnvalidatedRawStr {
-    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
-        if !self.valid_start {
-            return Err(LexRawStrError::InvalidStarter);
-        }
-
-        // Only up to 65535 `#`s are allowed in raw strings
-        let n_start_safe: u16 =
-            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
-
-        if self.n_start_hashes > self.n_end_hashes || !self.valid_end {
-            Err(LexRawStrError::NoTerminator {
-                expected: self.n_start_hashes,
-                found: self.n_end_hashes,
-                possible_terminator_offset: self.possible_terminator_offset,
-            })
-        } else {
-            // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
-            // they must be equal.
-            debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
-            Ok(ValidatedRawStr { n_hashes: n_start_safe })
-        }
-    }
-}
 
 /// Base of numeric literal encoding according to its prefix.
@@ -354,12 +294,12 @@ impl Cursor<'_> {
             'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
-                    let raw_str_i = self.raw_double_quoted_string(1);
+                    let (n_hashes, err) = self.raw_double_quoted_string(1);
                     let suffix_start = self.len_consumed();
-                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
+                    if err.is_none() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawStr(raw_str_i);
+                    let kind = RawStr { n_hashes, err };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident(),
@@ -389,14 +329,12 @@ impl Cursor<'_> {
                 }
                 ('r', '"') | ('r', '#') => {
                     self.bump();
-                    let raw_str_i = self.raw_double_quoted_string(2);
+                    let (n_hashes, err) = self.raw_double_quoted_string(2);
                     let suffix_start = self.len_consumed();
-                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
-                    if terminated {
+                    if err.is_none() {
                         self.eat_literal_suffix();
                     }
-
-                    let kind = RawByteStr(raw_str_i);
+                    let kind = RawByteStr { n_hashes, err };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident(),
@@ -692,27 +630,34 @@ impl Cursor<'_> {
         false
     }
 
-    /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
-    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
+    /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
+        // Wrap the actual function to handle the error with too many hashes.
+        // This way, it eats the whole raw string.
+        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+        // Only up to 65535 `#`s are allowed in raw strings
+        match u16::try_from(n_hashes) {
+            Ok(num) => (num, err),
+            // We lie about the number of hashes here :P
+            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+        }
+    }
+
+    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
         debug_assert!(self.prev() == 'r');
-        let mut valid_start: bool = false;
         let start_pos = self.len_consumed();
-        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
+        let mut possible_terminator_offset = None;
+        let mut max_hashes = 0;
 
         // Count opening '#' symbols.
         let n_start_hashes = self.eat_while(|c| c == '#');
 
         // Check that string is started.
         match self.bump() {
-            Some('"') => valid_start = true,
-            _ => {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: false,
-                    n_start_hashes,
-                    n_end_hashes: 0,
-                    possible_terminator_offset,
-                };
+            Some('"') => (),
+            c => {
+                let c = c.unwrap_or(EOF_CHAR);
+                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
             }
         }
 
@@ -722,13 +667,14 @@ impl Cursor<'_> {
             self.eat_while(|c| c != '"');
 
             if self.is_eof() {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: false,
+                return (
                     n_start_hashes,
-                    n_end_hashes: max_hashes,
-                    possible_terminator_offset,
-                };
+                    Some(RawStrError::NoTerminator {
+                        expected: n_start_hashes,
+                        found: max_hashes,
+                        possible_terminator_offset,
+                    }),
+                );
             }
 
             // Eat closing double quote.
@@ -737,7 +683,7 @@ impl Cursor<'_> {
             // Check that amount of closing '#' symbols
             // is equal to the amount of opening ones.
             // Note that this will not consume extra trailing `#` characters:
-            // `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }`
+            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
            // followed by a `#` token.
             let mut hashes_left = n_start_hashes;
             let is_closing_hash = |c| {
@@ -751,13 +697,7 @@ impl Cursor<'_> {
             let n_end_hashes = self.eat_while(is_closing_hash);
 
             if n_end_hashes == n_start_hashes {
-                return UnvalidatedRawStr {
-                    valid_start,
-                    valid_end: true,
-                    n_start_hashes,
-                    n_end_hashes,
-                    possible_terminator_offset: None,
-                };
+                return (n_start_hashes, None);
             } else if n_end_hashes > max_hashes {
                 // Keep track of possible terminators to give a hint about
                 // where there might be a missing terminator
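
The hunk above splits the lexing routine in two: `raw_string_unvalidated` counts hashes as a `usize`, and `raw_double_quoted_string` clamps that count to `u16`, converting overflow into `TooManyDelimiters` while deliberately reporting zero hashes. Below is a standalone sketch of that wrap-and-clamp step, under illustrative names (`clamp_hashes` is not rustc code):

use std::convert::TryFrom;

// The inner count is a `usize`; the public result is a `u16` plus an
// optional overflow report carrying the real count.
fn clamp_hashes(n_hashes: usize) -> (u16, Option<usize>) {
    match u16::try_from(n_hashes) {
        Ok(n) => (n, None),
        // Like the patch, "lie" about the hash count and keep the true
        // number only in the error payload.
        Err(_) => (0, Some(n_hashes)),
    }
}

fn main() {
    assert_eq!(clamp_hashes(3), (3, None));
    assert_eq!(clamp_hashes(70_000), (0, Some(70_000)));
}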

View file

@@ -2,77 +2,37 @@
 mod tests {
     use crate::*;
 
-    fn check_raw_str(
-        s: &str,
-        expected: UnvalidatedRawStr,
-        validated: Result<ValidatedRawStr, LexRawStrError>,
-    ) {
+    fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) {
         let s = &format!("r{}", s);
         let mut cursor = Cursor::new(s);
         cursor.bump();
-        let tok = cursor.raw_double_quoted_string(0);
-        assert_eq!(tok, expected);
-        assert_eq!(tok.validate(), validated);
+        let (n_hashes, err) = cursor.raw_double_quoted_string(0);
+        assert_eq!(n_hashes, expected_hashes);
+        assert_eq!(err, expected_err);
     }
 
     #[test]
     fn test_naked_raw_str() {
-        check_raw_str(
-            r#""abc""#,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 0 }),
-        );
+        check_raw_str(r#""abc""#, 0, None);
     }
 
     #[test]
     fn test_raw_no_start() {
-        check_raw_str(
-            r##""abc"#"##,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 0 }),
-        );
+        check_raw_str(r##""abc"#"##, 0, None);
     }
 
     #[test]
     fn test_too_many_terminators() {
         // this error is handled in the parser later
-        check_raw_str(
-            r###"#"abc"##"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 1,
-                valid_end: true,
-                valid_start: true,
-                possible_terminator_offset: None,
-            },
-            Ok(ValidatedRawStr { n_hashes: 1 }),
-        );
+        check_raw_str(r###"#"abc"##"###, 1, None);
     }
 
     #[test]
     fn test_unterminated() {
         check_raw_str(
             r#"#"abc"#,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 0,
-                valid_end: false,
-                valid_start: true,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            1,
+            Some(RawStrError::NoTerminator {
                 expected: 1,
                 found: 0,
                 possible_terminator_offset: None,
@@ -80,14 +40,8 @@ mod tests {
         );
         check_raw_str(
             r###"##"abc"#"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 2,
-                n_end_hashes: 1,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: Some(7),
-            },
-            Err(LexRawStrError::NoTerminator {
+            2,
+            Some(RawStrError::NoTerminator {
                 expected: 2,
                 found: 1,
                 possible_terminator_offset: Some(7),
@@ -96,14 +50,8 @@ mod tests {
         // We're looking for "# not just any #
         check_raw_str(
             r###"##"abc#"###,
-            UnvalidatedRawStr {
-                n_start_hashes: 2,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            2,
+            Some(RawStrError::NoTerminator {
                 expected: 2,
                 found: 0,
                 possible_terminator_offset: None,
@@ -113,17 +61,7 @@ mod tests {
 
     #[test]
     fn test_invalid_start() {
-        check_raw_str(
-            r##"#~"abc"#"##,
-            UnvalidatedRawStr {
-                n_start_hashes: 1,
-                n_end_hashes: 0,
-                valid_start: false,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::InvalidStarter),
-        );
+        check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
     }
 
     #[test]
@@ -131,14 +69,8 @@ mod tests {
         // https://github.com/rust-lang/rust/issues/70677
         check_raw_str(
             r#"""#,
-            UnvalidatedRawStr {
-                n_start_hashes: 0,
-                n_end_hashes: 0,
-                valid_start: true,
-                valid_end: false,
-                possible_terminator_offset: None,
-            },
-            Err(LexRawStrError::NoTerminator {
+            0,
+            Some(RawStrError::NoTerminator {
                 expected: 0,
                 found: 0,
                 possible_terminator_offset: None,
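
With the slimmer `check_raw_str(s, expected_hashes, expected_err)` signature, further cases stay one-liners. A hypothetical extra test in the same style, not part of this commit, relying on the `check_raw_str` helper defined in the hunk above:

#[test]
fn test_matched_hashes_sketch() {
    // Hypothetical case: `r##"abc"##` has matching opening and closing `##`,
    // so the lexer should report two hashes and no error.
    check_raw_str(r###"##"abc"##"###, 2, None);
}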

View file

@@ -3,7 +3,7 @@ use rustc_ast::util::comments;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
-use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr};
+use rustc_lexer::{unescape, RawStrError};
 use rustc_session::parse::ParseSess;
 use rustc_span::symbol::{sym, Symbol};
 use rustc_span::{BytePos, Pos, Span};
@@ -359,15 +359,13 @@ impl<'a> StringReader<'a> {
                 }
                 (token::ByteStr, Mode::ByteStr, 2, 1) // b" "
             }
-            rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => {
-                let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
-                let n_hashes = valid_raw_str.num_hashes();
+            rustc_lexer::LiteralKind::RawStr { n_hashes, err } => {
+                self.report_raw_str_error(start, err);
                 let n = u32::from(n_hashes);
                 (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "##
             }
-            rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => {
-                let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str);
-                let n_hashes = validated_raw_str.num_hashes();
+            rustc_lexer::LiteralKind::RawByteStr { n_hashes, err } => {
+                self.report_raw_str_error(start, err);
                 let n = u32::from(n_hashes);
                 (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "##
             }
@@ -459,28 +457,21 @@ impl<'a> StringReader<'a> {
         }
     }
 
-    fn validate_and_report_errors(
-        &self,
-        start: BytePos,
-        unvalidated_raw_str: UnvalidatedRawStr,
-    ) -> ValidatedRawStr {
-        match unvalidated_raw_str.validate() {
-            Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start),
-            Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
-                self.report_unterminated_raw_string(
-                    start,
-                    expected,
-                    possible_terminator_offset,
-                    found,
-                )
-            }
-            Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start),
-            Ok(valid) => valid,
+    fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
+        match opt_err {
+            Some(RawStrError::InvalidStarter { bad_char }) => {
+                self.report_non_started_raw_string(start, bad_char)
+            }
+            Some(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
+                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
+            Some(RawStrError::TooManyDelimiters { found }) => {
+                self.report_too_many_hashes(start, found)
+            }
+            None => (),
         }
     }
 
-    fn report_non_started_raw_string(&self, start: BytePos) -> ! {
-        let bad_char = self.str_from(start).chars().last().unwrap();
+    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
         self.struct_fatal_span_char(
             start,
             self.pos,
@@ -530,11 +521,17 @@ impl<'a> StringReader<'a> {
         FatalError.raise()
     }
 
-    fn report_too_many_hashes(&self, start: BytePos) -> ! {
+    /// Note: It was decided to not add a test case, because it would be to big.
+    /// https://github.com/rust-lang/rust/pull/50296#issuecomment-392135180
+    fn report_too_many_hashes(&self, start: BytePos, found: usize) -> ! {
        self.fatal_span_(
             start,
             self.pos,
-            "too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
+            &format!(
+                "too many `#` symbols: raw strings may be delimited \
+                 by up to 65535 `#` symbols, but found {}",
+                found
+            ),
         )
         .raise();
     }