1
Fork 0

Auto merge of #118897 - nnethercote:more-unescaping-cleanups, r=fee1-dead

More unescaping cleanups

More minor improvements I found while working on #118699.

r? `@fee1-dead`
This commit is contained in:
bors 2023-12-16 08:52:06 +00:00
commit 3ad8e2d129
3 changed files with 82 additions and 88 deletions

View file

@ -77,6 +77,8 @@ impl LitKind {
// new symbol because the string in the LitKind is different to the // new symbol because the string in the LitKind is different to the
// string in the token. // string in the token.
let s = symbol.as_str(); let s = symbol.as_str();
// Vanilla strings are so common we optimize for the common case where no chars
// requiring special behaviour are present.
let symbol = if s.contains(['\\', '\r']) { let symbol = if s.contains(['\\', '\r']) {
let mut buf = String::with_capacity(s.len()); let mut buf = String::with_capacity(s.len());
let mut error = Ok(()); let mut error = Ok(());
@ -104,27 +106,20 @@ impl LitKind {
LitKind::Str(symbol, ast::StrStyle::Cooked) LitKind::Str(symbol, ast::StrStyle::Cooked)
} }
token::StrRaw(n) => { token::StrRaw(n) => {
// Ditto. // Raw strings have no escapes, so we only need to check for invalid chars, and we
let s = symbol.as_str(); // can reuse the symbol on success.
let symbol = let mut error = Ok(());
if s.contains('\r') { unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
let mut buf = String::with_capacity(s.len()); match unescaped_char {
let mut error = Ok(()); Ok(_) => {}
unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| { Err(err) => {
match unescaped_char { if err.is_fatal() {
Ok(c) => buf.push(c), error = Err(LitError::LexerError);
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
} }
}); }
error?; }
Symbol::intern(&buf) });
} else { error?;
symbol
};
LitKind::Str(symbol, ast::StrStyle::Raw(n)) LitKind::Str(symbol, ast::StrStyle::Raw(n))
} }
token::ByteStr => { token::ByteStr => {
@ -143,25 +138,19 @@ impl LitKind {
LitKind::ByteStr(buf.into(), StrStyle::Cooked) LitKind::ByteStr(buf.into(), StrStyle::Cooked)
} }
token::ByteStrRaw(n) => { token::ByteStrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we
// can convert the symbol directly to a `Lrc<[u8]>` on success.
let s = symbol.as_str(); let s = symbol.as_str();
let bytes = if s.contains('\r') { let mut error = Ok(());
let mut buf = Vec::with_capacity(s.len()); unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
let mut error = Ok(()); Ok(_) => {}
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { Err(err) => {
Ok(c) => buf.push(byte_from_char(c)), if err.is_fatal() {
Err(err) => { error = Err(LitError::LexerError);
if err.is_fatal() {
error = Err(LitError::LexerError);
}
} }
}); }
error?; });
buf LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
} else {
symbol.to_string().into_bytes()
};
LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
} }
token::CStr => { token::CStr => {
let s = symbol.as_str(); let s = symbol.as_str();
@ -172,7 +161,6 @@ impl LitKind {
error = Err(LitError::NulInCStr(span)); error = Err(LitError::NulInCStr(span));
} }
Ok(CStrUnit::Byte(b)) => buf.push(b), Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => { Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
} }
@ -187,18 +175,15 @@ impl LitKind {
LitKind::CStr(buf.into(), StrStyle::Cooked) LitKind::CStr(buf.into(), StrStyle::Cooked)
} }
token::CStrRaw(n) => { token::CStrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we
// can convert the symbol directly to a `Lrc<[u8]>` on success.
let s = symbol.as_str(); let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(()); let mut error = Ok(());
unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c { unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => { Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
error = Err(LitError::NulInCStr(span)); error = Err(LitError::NulInCStr(span));
} }
Ok(CStrUnit::Byte(b)) => buf.push(b), Ok(_) => {}
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Err(err) => { Err(err) => {
if err.is_fatal() { if err.is_fatal() {
error = Err(LitError::LexerError); error = Err(LitError::LexerError);
@ -206,6 +191,7 @@ impl LitKind {
} }
}); });
error?; error?;
let mut buf = s.to_owned().into_bytes();
buf.push(0); buf.push(0);
LitKind::CStr(buf.into(), StrStyle::Raw(n)) LitKind::CStr(buf.into(), StrStyle::Raw(n))
} }

View file

@ -92,8 +92,8 @@ where
let res = unescape_char_or_byte(&mut chars, mode); let res = unescape_char_or_byte(&mut chars, mode);
callback(0..(src.len() - chars.as_str().len()), res); callback(0..(src.len() - chars.as_str().len()), res);
} }
Str | ByteStr => unescape_str_common(src, mode, callback), Str | ByteStr => unescape_non_raw_common(src, mode, callback),
RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), RawStr | RawByteStr => check_raw_common(src, mode, callback),
CStr | RawCStr => unreachable!(), CStr | RawCStr => unreachable!(),
} }
} }
@ -122,12 +122,10 @@ where
{ {
match mode { match mode {
CStr => { CStr => {
unescape_str_common(src, mode, callback); unescape_non_raw_common(src, mode, callback);
} }
RawCStr => { RawCStr => {
unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char)));
callback(r, result.map(CStrUnit::Char))
});
} }
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
} }
@ -191,8 +189,9 @@ impl Mode {
/// Byte literals do not allow unicode escape. /// Byte literals do not allow unicode escape.
fn is_unicode_escape_disallowed(self) -> bool { fn is_unicode_escape_disallowed(self) -> bool {
match self { match self {
Byte | ByteStr | RawByteStr => true, Byte | ByteStr => true,
Char | Str | RawStr | CStr | RawCStr => false, Char | Str | CStr => false,
RawByteStr | RawStr | RawCStr => unreachable!(),
} }
} }
@ -324,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
/// Takes the contents of a string literal (without quotes) and produces a /// Takes the contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors. /// sequence of escaped characters or errors.
fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F) fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<T, EscapeError>), F: FnMut(Range<usize>, Result<T, EscapeError>),
{ {
@ -391,7 +390,7 @@ where
/// sequence of characters or errors. /// sequence of characters or errors.
/// NOTE: Raw strings do not perform any explicit character escaping, here we /// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only produce errors on bare CR. /// only produce errors on bare CR.
fn unescape_raw_str_or_raw_byte_str<F>(src: &str, mode: Mode, callback: &mut F) fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<char, EscapeError>), F: FnMut(Range<usize>, Result<char, EscapeError>),
{ {
@ -399,7 +398,7 @@ where
let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
// The `start` and `end` computation here matches the one in // The `start` and `end` computation here matches the one in
// `unescape_str_common` for consistency, even though this function // `unescape_non_raw_common` for consistency, even though this function
// doesn't have to worry about skipping any chars. // doesn't have to worry about skipping any chars.
while let Some(c) = chars.next() { while let Some(c) = chars.next() {
let start = src.len() - chars.as_str().len() - c.len_utf8(); let start = src.len() - chars.as_str().len() - c.len_utf8();

View file

@ -11,12 +11,12 @@ use crate::errors::{MoreThanOneCharNote, MoreThanOneCharSugg, NoBraceUnicodeSub,
pub(crate) fn emit_unescape_error( pub(crate) fn emit_unescape_error(
handler: &Handler, handler: &Handler,
// interior part of the literal, without quotes // interior part of the literal, between quotes
lit: &str, lit: &str,
// full span of the literal, including quotes // full span of the literal, including quotes and any prefix
span_with_quotes: Span, full_lit_span: Span,
// interior span of the literal, without quotes // span of the error part of the literal
span: Span, err_span: Span,
mode: Mode, mode: Mode,
// range of the error inside `lit` // range of the error inside `lit`
range: Range<usize>, range: Range<usize>,
@ -24,19 +24,21 @@ pub(crate) fn emit_unescape_error(
) { ) {
debug!( debug!(
"emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}", "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
lit, span_with_quotes, mode, range, error lit, full_lit_span, mode, range, error
); );
let last_char = || { let last_char = || {
let c = lit[range.clone()].chars().next_back().unwrap(); let c = lit[range.clone()].chars().next_back().unwrap();
let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32)); let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32));
(c, span) (c, span)
}; };
match error { match error {
EscapeError::LoneSurrogateUnicodeEscape => { EscapeError::LoneSurrogateUnicodeEscape => {
handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true }); handler
.emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true });
} }
EscapeError::OutOfRangeUnicodeEscape => { EscapeError::OutOfRangeUnicodeEscape => {
handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false }); handler
.emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false });
} }
EscapeError::MoreThanOneChar => { EscapeError::MoreThanOneChar => {
use unicode_normalization::{char::is_combining_mark, UnicodeNormalization}; use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error(
let normalized = lit.nfc().to_string(); let normalized = lit.nfc().to_string();
if normalized.chars().count() == 1 { if normalized.chars().count() == 1 {
let ch = normalized.chars().next().unwrap().escape_default().to_string(); let ch = normalized.chars().next().unwrap().escape_default().to_string();
sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized }); sugg = Some(MoreThanOneCharSugg::NormalizedForm {
span: err_span,
ch,
normalized,
});
} }
let escaped_marks = let escaped_marks =
rest.iter().map(|c| c.escape_default().to_string()).collect::<Vec<_>>(); rest.iter().map(|c| c.escape_default().to_string()).collect::<Vec<_>>();
note = Some(MoreThanOneCharNote::AllCombining { note = Some(MoreThanOneCharNote::AllCombining {
span, span: err_span,
chr: format!("{first}"), chr: format!("{first}"),
len: escaped_marks.len(), len: escaped_marks.len(),
escaped_marks: escaped_marks.join(""), escaped_marks: escaped_marks.join(""),
@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error(
.collect(); .collect();
if let &[ch] = printable.as_slice() { if let &[ch] = printable.as_slice() {
sugg = sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting {
Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() }); span: err_span,
ch: ch.to_string(),
});
note = Some(MoreThanOneCharNote::NonPrinting { note = Some(MoreThanOneCharNote::NonPrinting {
span, span: err_span,
escaped: lit.escape_default().to_string(), escaped: lit.escape_default().to_string(),
}); });
} }
@ -91,13 +99,13 @@ pub(crate) fn emit_unescape_error(
} }
let sugg = format!("{prefix}\"{escaped}\""); let sugg = format!("{prefix}\"{escaped}\"");
MoreThanOneCharSugg::Quotes { MoreThanOneCharSugg::Quotes {
span: span_with_quotes, span: full_lit_span,
is_byte: mode == Mode::Byte, is_byte: mode == Mode::Byte,
sugg, sugg,
} }
}); });
handler.emit_err(UnescapeError::MoreThanOneChar { handler.emit_err(UnescapeError::MoreThanOneChar {
span: span_with_quotes, span: full_lit_span,
note, note,
suggestion: sugg, suggestion: sugg,
}); });
@ -105,7 +113,7 @@ pub(crate) fn emit_unescape_error(
EscapeError::EscapeOnlyChar => { EscapeError::EscapeOnlyChar => {
let (c, char_span) = last_char(); let (c, char_span) = last_char();
handler.emit_err(UnescapeError::EscapeOnlyChar { handler.emit_err(UnescapeError::EscapeOnlyChar {
span, span: err_span,
char_span, char_span,
escaped_sugg: c.escape_default().to_string(), escaped_sugg: c.escape_default().to_string(),
escaped_msg: escaped_char(c), escaped_msg: escaped_char(c),
@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error(
} }
EscapeError::BareCarriageReturn => { EscapeError::BareCarriageReturn => {
let double_quotes = mode.in_double_quotes(); let double_quotes = mode.in_double_quotes();
handler.emit_err(UnescapeError::BareCr { span, double_quotes }); handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes });
} }
EscapeError::BareCarriageReturnInRawString => { EscapeError::BareCarriageReturnInRawString => {
assert!(mode.in_double_quotes()); assert!(mode.in_double_quotes());
handler.emit_err(UnescapeError::BareCrRawString(span)); handler.emit_err(UnescapeError::BareCrRawString(err_span));
} }
EscapeError::InvalidEscape => { EscapeError::InvalidEscape => {
let (c, span) = last_char(); let (c, span) = last_char();
@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error(
} else { } else {
if mode == Mode::Str || mode == Mode::Char { if mode == Mode::Str || mode == Mode::Char {
diag.span_suggestion( diag.span_suggestion(
span_with_quotes, full_lit_span,
"if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal",
format!("r\"{lit}\""), format!("r\"{lit}\""),
Applicability::MaybeIncorrect, Applicability::MaybeIncorrect,
@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error(
diag.emit(); diag.emit();
} }
EscapeError::TooShortHexEscape => { EscapeError::TooShortHexEscape => {
handler.emit_err(UnescapeError::TooShortHexEscape(span)); handler.emit_err(UnescapeError::TooShortHexEscape(err_span));
} }
EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => { EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => {
let (c, span) = last_char(); let (c, span) = last_char();
@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error(
err.emit(); err.emit();
} }
EscapeError::OutOfRangeHexEscape => { EscapeError::OutOfRangeHexEscape => {
handler.emit_err(UnescapeError::OutOfRangeHexEscape(span)); handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span));
} }
EscapeError::LeadingUnderscoreUnicodeEscape => { EscapeError::LeadingUnderscoreUnicodeEscape => {
let (c, span) = last_char(); let (c, span) = last_char();
@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error(
}); });
} }
EscapeError::OverlongUnicodeEscape => { EscapeError::OverlongUnicodeEscape => {
handler.emit_err(UnescapeError::OverlongUnicodeEscape(span)); handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span));
} }
EscapeError::UnclosedUnicodeEscape => { EscapeError::UnclosedUnicodeEscape => {
handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi())); handler
.emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi()));
} }
EscapeError::NoBraceInUnicodeEscape => { EscapeError::NoBraceInUnicodeEscape => {
let mut suggestion = "\\u{".to_owned(); let mut suggestion = "\\u{".to_owned();
@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error(
let (label, sub) = if suggestion_len > 0 { let (label, sub) = if suggestion_len > 0 {
suggestion.push('}'); suggestion.push('}');
let hi = char_span.lo() + BytePos(suggestion_len as u32); let hi = char_span.lo() + BytePos(suggestion_len as u32);
(None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion }) (None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion })
} else { } else {
(Some(span), NoBraceUnicodeSub::Help) (Some(err_span), NoBraceUnicodeSub::Help)
}; };
handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub }); handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub });
} }
EscapeError::UnicodeEscapeInByte => { EscapeError::UnicodeEscapeInByte => {
handler.emit_err(UnescapeError::UnicodeEscapeInByte(span)); handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span));
} }
EscapeError::EmptyUnicodeEscape => { EscapeError::EmptyUnicodeEscape => {
handler.emit_err(UnescapeError::EmptyUnicodeEscape(span)); handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span));
} }
EscapeError::ZeroChars => { EscapeError::ZeroChars => {
handler.emit_err(UnescapeError::ZeroChars(span)); handler.emit_err(UnescapeError::ZeroChars(err_span));
} }
EscapeError::LoneSlash => { EscapeError::LoneSlash => {
handler.emit_err(UnescapeError::LoneSlash(span)); handler.emit_err(UnescapeError::LoneSlash(err_span));
} }
EscapeError::UnskippedWhitespaceWarning => { EscapeError::UnskippedWhitespaceWarning => {
let (c, char_span) = last_char(); let (c, char_span) = last_char();
handler.emit_warning(UnescapeError::UnskippedWhitespace { handler.emit_warning(UnescapeError::UnskippedWhitespace {
span, span: err_span,
ch: escaped_char(c), ch: escaped_char(c),
char_span, char_span,
}); });
} }
EscapeError::MultipleSkippedLinesWarning => { EscapeError::MultipleSkippedLinesWarning => {
handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span)); handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span));
} }
} }
} }