1
Fork 0

Rollup merge of #94316 - nnethercote:improve-string-literal-unescaping, r=petrochenkov

Improve string literal unescaping

Some easy wins that affect a few popular crates.

r? ```@matklad```
This commit is contained in:
Dylan DPC 2022-02-24 21:42:18 +01:00 committed by GitHub
commit ec44d48ae3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 40 deletions

View file

@ -16,6 +16,7 @@
#![feature(min_specialization)] #![feature(min_specialization)]
#![recursion_limit = "256"] #![recursion_limit = "256"]
#![feature(slice_internals)] #![feature(slice_internals)]
#![feature(stmt_expr_attributes)]
#[macro_use] #[macro_use]
extern crate rustc_macros; extern crate rustc_macros;

View file

@ -56,25 +56,30 @@ impl LitKind {
// new symbol because the string in the LitKind is different to the // new symbol because the string in the LitKind is different to the
// string in the token. // string in the token.
let s = symbol.as_str(); let s = symbol.as_str();
let symbol = let symbol = if s.contains(&['\\', '\r']) {
if s.contains(&['\\', '\r']) { let mut buf = String::with_capacity(s.len());
let mut buf = String::with_capacity(s.len()); let mut error = Ok(());
let mut error = Ok(()); // Force-inlining here is aggressive but the closure is
unescape_literal(&s, Mode::Str, &mut |_, unescaped_char| { // called on every char in the string, so it can be
match unescaped_char { // hot in programs with many long strings.
Ok(c) => buf.push(c), unescape_literal(
Err(err) => { &s,
if err.is_fatal() { Mode::Str,
error = Err(LitError::LexerError); &mut #[inline(always)]
} |_, unescaped_char| match unescaped_char {
Ok(c) => buf.push(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
} }
} }
}); },
error?; );
Symbol::intern(&buf) error?;
} else { Symbol::intern(&buf)
symbol } else {
}; symbol
};
LitKind::Str(symbol, ast::StrStyle::Cooked) LitKind::Str(symbol, ast::StrStyle::Cooked)
} }
token::StrRaw(n) => { token::StrRaw(n) => {

View file

@ -159,26 +159,8 @@ impl Mode {
} }
} }
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
if first_char != '\\' { // Previous character was '\\', unescape what follows.
// Previous character was not a slash, and we don't expect it to be
// an escape-only character.
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
if mode.is_bytes() && !first_char.is_ascii() {
// Byte literal can't be a non-ascii character.
return Err(EscapeError::NonAsciiCharInByte);
}
Ok(first_char)
}
};
}
// Previous character is '\\', try to unescape it.
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
@ -270,9 +252,24 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
Ok(res) Ok(res)
} }
/// Checks that `first_char` is allowed in a literal of the given `mode`.
///
/// Returns `Ok(first_char)` unchanged, except when `mode.is_bytes()` is true
/// and `first_char` is not ASCII, in which case it returns
/// `Err(EscapeError::NonAsciiCharInByte)`.
#[inline]
fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> {
    // Only byte-oriented modes restrict the character set; everything else
    // passes straight through.
    if !mode.is_bytes() || first_char.is_ascii() {
        return Ok(first_char);
    }
    // A byte literal cannot hold a non-ASCII character.
    Err(EscapeError::NonAsciiCharInByte)
}
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = scan_escape(first_char, chars, mode)?; let res = match first_char {
'\\' => scan_escape(chars, mode),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(first_char, mode),
}?;
if chars.next().is_some() { if chars.next().is_some() {
return Err(EscapeError::MoreThanOneChar); return Err(EscapeError::MoreThanOneChar);
} }
@ -303,12 +300,14 @@ where
skip_ascii_whitespace(&mut chars, start, callback); skip_ascii_whitespace(&mut chars, start, callback);
continue; continue;
} }
_ => scan_escape(first_char, &mut chars, mode), _ => scan_escape(&mut chars, mode),
} }
} }
'\n' => Ok('\n'), '\n' => Ok('\n'),
'\t' => Ok('\t'), '\t' => Ok('\t'),
_ => scan_escape(first_char, &mut chars, mode), '"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(first_char, mode),
}; };
let end = initial_len - chars.as_str().len(); let end = initial_len - chars.as_str().len();
callback(start..end, unescaped_char); callback(start..end, unescaped_char);