1
Fork 0

Rollup merge of #120329 - nnethercote:3349-precursors, r=fee1-dead

RFC 3349 precursors

Some cleanups I found while working on RFC 3349 that are worth landing separately.

r? `@fee1-dead`
This commit is contained in:
Matthias Krüger 2024-01-26 14:43:31 +01:00 committed by GitHub
commit 5f1f6176a5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 161 additions and 201 deletions

View file

@ -3,8 +3,7 @@
use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token}; use crate::token::{self, Token};
use rustc_lexer::unescape::{ use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
Mode,
}; };
use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span; use rustc_span::Span;
@ -48,6 +47,9 @@ impl LitKind {
return Err(LitError::InvalidSuffix); return Err(LitError::InvalidSuffix);
} }
// For byte/char/string literals, chars and escapes have already been
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
// chars and escapes are valid here.
Ok(match kind { Ok(match kind {
token::Bool => { token::Bool => {
assert!(symbol.is_bool_lit()); assert!(symbol.is_bool_lit());
@ -56,12 +58,12 @@ impl LitKind {
token::Byte => { token::Byte => {
return unescape_byte(symbol.as_str()) return unescape_byte(symbol.as_str())
.map(LitKind::Byte) .map(LitKind::Byte)
.map_err(|_| LitError::LexerError); .map_err(|_| panic!("failed to unescape byte literal"));
} }
token::Char => { token::Char => {
return unescape_char(symbol.as_str()) return unescape_char(symbol.as_str())
.map(LitKind::Char) .map(LitKind::Char)
.map_err(|_| LitError::LexerError); .map_err(|_| panic!("failed to unescape char literal"));
} }
// There are some valid suffixes for integer and float literals, // There are some valid suffixes for integer and float literals,
@ -77,26 +79,22 @@ impl LitKind {
let s = symbol.as_str(); let s = symbol.as_str();
// Vanilla strings are so common we optimize for the common case where no chars // Vanilla strings are so common we optimize for the common case where no chars
// requiring special behaviour are present. // requiring special behaviour are present.
let symbol = if s.contains(['\\', '\r']) { let symbol = if s.contains('\\') {
let mut buf = String::with_capacity(s.len()); let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
// Force-inlining here is aggressive but the closure is // Force-inlining here is aggressive but the closure is
// called on every char in the string, so it can be // called on every char in the string, so it can be hot in
// hot in programs with many long strings. // programs with many long strings containing escapes.
unescape_literal( unescape_unicode(
s, s,
Mode::Str, Mode::Str,
&mut #[inline(always)] &mut #[inline(always)]
|_, unescaped_char| match unescaped_char { |_, c| match c {
Ok(c) => buf.push(c), Ok(c) => buf.push(c),
Err(err) => { Err(err) => {
if err.is_fatal() { assert!(!err.is_fatal(), "failed to unescape string literal")
error = Err(LitError::LexerError);
}
} }
}, },
); );
error?;
Symbol::intern(&buf) Symbol::intern(&buf)
} else { } else {
symbol symbol
@ -104,86 +102,46 @@ impl LitKind {
LitKind::Str(symbol, ast::StrStyle::Cooked) LitKind::Str(symbol, ast::StrStyle::Cooked)
} }
token::StrRaw(n) => { token::StrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we // Raw strings have no escapes so no work is needed here.
// can reuse the symbol on success.
let mut error = Ok(());
unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
match unescaped_char {
Ok(_) => {}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
}
});
error?;
LitKind::Str(symbol, ast::StrStyle::Raw(n)) LitKind::Str(symbol, ast::StrStyle::Raw(n))
} }
token::ByteStr => { token::ByteStr => {
let s = symbol.as_str(); let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len()); let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(()); unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
Ok(c) => buf.push(byte_from_char(c)), Ok(c) => buf.push(byte_from_char(c)),
Err(err) => { Err(err) => {
if err.is_fatal() { assert!(!err.is_fatal(), "failed to unescape string literal")
error = Err(LitError::LexerError);
}
} }
}); });
error?;
LitKind::ByteStr(buf.into(), StrStyle::Cooked) LitKind::ByteStr(buf.into(), StrStyle::Cooked)
} }
token::ByteStrRaw(n) => { token::ByteStrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we // Raw strings have no escapes so we can convert the symbol
// can convert the symbol directly to a `Lrc<[u8]>` on success. // directly to a `Lrc<[u8]>`.
let s = symbol.as_str(); let buf = symbol.as_str().to_owned().into_bytes();
let mut error = Ok(()); LitKind::ByteStr(buf.into(), StrStyle::Raw(n))
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
Ok(_) => {}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
} }
token::CStr => { token::CStr => {
let s = symbol.as_str(); let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len()); let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(()); unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c { Ok(MixedUnit::Char(c)) => {
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
} }
Ok(MixedUnit::HighByte(b)) => buf.push(b),
Err(err) => { Err(err) => {
if err.is_fatal() { assert!(!err.is_fatal(), "failed to unescape C string literal")
error = Err(LitError::LexerError);
}
} }
}); });
error?;
buf.push(0); buf.push(0);
LitKind::CStr(buf.into(), StrStyle::Cooked) LitKind::CStr(buf.into(), StrStyle::Cooked)
} }
token::CStrRaw(n) => { token::CStrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we // Raw strings have no escapes so we can convert the symbol
// can convert the symbol directly to a `Lrc<[u8]>` on success. // directly to a `Lrc<[u8]>` after appending the terminating NUL
let s = symbol.as_str(); // char.
let mut error = Ok(()); let mut buf = symbol.as_str().to_owned().into_bytes();
unescape_c_string(s, Mode::RawCStr, &mut |_, c| match c {
Ok(_) => {}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
error?;
let mut buf = s.to_owned().into_bytes();
buf.push(0); buf.push(0);
LitKind::CStr(buf.into(), StrStyle::Raw(n)) LitKind::CStr(buf.into(), StrStyle::Raw(n))
} }

View file

@ -80,12 +80,12 @@ impl EscapeError {
} }
} }
/// Takes a contents of a literal (without quotes) and produces a sequence of /// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
/// escaped characters or errors. /// quotes) and produces a sequence of escaped characters or errors.
/// ///
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, /// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
/// the callback will be called exactly once. /// the callback will be called exactly once.
pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F) pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<char, EscapeError>), F: FnMut(Range<usize>, Result<char, EscapeError>),
{ {
@ -97,50 +97,63 @@ where
} }
Str | ByteStr => unescape_non_raw_common(src, mode, callback), Str | ByteStr => unescape_non_raw_common(src, mode, callback),
RawStr | RawByteStr => check_raw_common(src, mode, callback), RawStr | RawByteStr => check_raw_common(src, mode, callback),
CStr | RawCStr => unreachable!(), RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
}
}
/// A unit within CStr. Must not be a nul character.
pub enum CStrUnit {
Byte(u8),
Char(char),
}
impl From<u8> for CStrUnit {
fn from(value: u8) -> Self {
CStrUnit::Byte(value)
}
}
impl From<char> for CStrUnit {
fn from(value: char) -> Self {
CStrUnit::Char(value)
}
}
pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
{
match mode {
CStr => {
unescape_non_raw_common(src, mode, &mut |r, mut result| {
if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result {
result = Err(EscapeError::NulInCStr);
}
callback(r, result)
});
}
RawCStr => {
check_raw_common(src, mode, &mut |r, mut result| {
if let Ok('\0') = result { if let Ok('\0') = result {
result = Err(EscapeError::NulInCStr); result = Err(EscapeError::NulInCStr);
} }
callback(r, result.map(CStrUnit::Char)) callback(r, result)
}); }),
CStr => unreachable!(),
} }
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), }
/// Used for mixed utf8 string literals, i.e. those that allow both unicode
/// chars and high bytes.
pub enum MixedUnit {
/// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
/// and Unicode chars (written directly or via `\u` escapes).
///
/// For example, if '¥' appears in a string it is represented here as
/// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
/// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
Char(char),
/// Used for high bytes (`\x80`..`\xff`).
///
/// For example, if `\xa5` appears in a string it is represented here as
/// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
/// byte string as the single byte `0xa5`.
HighByte(u8),
}
impl From<char> for MixedUnit {
fn from(c: char) -> Self {
MixedUnit::Char(c)
}
}
impl From<u8> for MixedUnit {
fn from(n: u8) -> Self {
if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) }
}
}
/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
/// a sequence of escaped characters or errors.
///
/// Values are returned by invoking `callback`.
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
{
match mode {
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
if let Ok(MixedUnit::Char('\0')) = result {
result = Err(EscapeError::NulInCStr);
}
callback(r, result)
}),
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
} }
} }
@ -181,29 +194,29 @@ impl Mode {
} }
} }
/// Non-byte literals should have `\xXX` escapes that are within the ASCII range. /// Are `\x80`..`\xff` allowed?
fn ascii_escapes_should_be_ascii(self) -> bool { fn allow_high_bytes(self) -> bool {
match self { match self {
Char | Str => true, Char | Str => false,
Byte | ByteStr | CStr => false, Byte | ByteStr | CStr => true,
RawStr | RawByteStr | RawCStr => unreachable!(), RawStr | RawByteStr | RawCStr => unreachable!(),
} }
} }
/// Whether characters within the literal must be within the ASCII range. /// Are unicode (non-ASCII) chars allowed?
#[inline] #[inline]
fn chars_should_be_ascii(self) -> bool { fn allow_unicode_chars(self) -> bool {
match self { match self {
Byte | ByteStr | RawByteStr => true, Byte | ByteStr | RawByteStr => false,
Char | Str | RawStr | CStr | RawCStr => false, Char | Str | RawStr | CStr | RawCStr => true,
} }
} }
/// Byte literals do not allow unicode escape. /// Are unicode escapes (`\u`) allowed?
fn is_unicode_escape_disallowed(self) -> bool { fn allow_unicode_escapes(self) -> bool {
match self { match self {
Byte | ByteStr => true, Byte | ByteStr => false,
Char | Str | CStr => false, Char | Str | CStr => true,
RawByteStr | RawStr | RawCStr => unreachable!(), RawByteStr | RawStr | RawCStr => unreachable!(),
} }
} }
@ -217,20 +230,19 @@ impl Mode {
} }
} }
fn scan_escape<T: From<u8> + From<char>>( fn scan_escape<T: From<char> + From<u8>>(
chars: &mut Chars<'_>, chars: &mut Chars<'_>,
mode: Mode, mode: Mode,
) -> Result<T, EscapeError> { ) -> Result<T, EscapeError> {
// Previous character was '\\', unescape what follows. // Previous character was '\\', unescape what follows.
let res = match chars.next().ok_or(EscapeError::LoneSlash)? { let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
'"' => b'"', '"' => '"',
'n' => b'\n', 'n' => '\n',
'r' => b'\r', 'r' => '\r',
't' => b'\t', 't' => '\t',
'\\' => b'\\', '\\' => '\\',
'\'' => b'\'', '\'' => '\'',
'0' => b'\0', '0' => '\0',
'x' => { 'x' => {
// Parse hexadecimal character code. // Parse hexadecimal character code.
@ -240,25 +252,23 @@ fn scan_escape<T: From<u8> + From<char>>(
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let value = hi * 16 + lo; let value = (hi * 16 + lo) as u8;
if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) { return if !mode.allow_high_bytes() && !value.is_ascii() {
return Err(EscapeError::OutOfRangeHexEscape); Err(EscapeError::OutOfRangeHexEscape)
} else {
// This may be a high byte, but that will only happen if `T` is
// `MixedUnit`, because of the `allow_high_bytes` check above.
Ok(T::from(value as u8))
};
} }
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
value as u8
}
'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into),
_ => return Err(EscapeError::InvalidEscape), _ => return Err(EscapeError::InvalidEscape),
}; };
Ok(res.into()) Ok(T::from(res))
} }
fn scan_unicode( fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
chars: &mut Chars<'_>,
is_unicode_escape_disallowed: bool,
) -> Result<char, EscapeError> {
// We've parsed '\u', now we have to parse '{..}'. // We've parsed '\u', now we have to parse '{..}'.
if chars.next() != Some('{') { if chars.next() != Some('{') {
@ -286,7 +296,7 @@ fn scan_unicode(
// Incorrect syntax has higher priority for error reporting // Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal. // than unallowed value for a literal.
if is_unicode_escape_disallowed { if !allow_unicode_escapes {
return Err(EscapeError::UnicodeEscapeInByte); return Err(EscapeError::UnicodeEscapeInByte);
} }
@ -312,12 +322,8 @@ fn scan_unicode(
} }
#[inline] #[inline]
fn ascii_check(c: char, chars_should_be_ascii: bool) -> Result<char, EscapeError> { fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
if chars_should_be_ascii && !c.is_ascii() { if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
Err(EscapeError::NonAsciiCharInByte)
} else {
Ok(c)
}
} }
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
@ -326,7 +332,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
'\\' => scan_escape(chars, mode), '\\' => scan_escape(chars, mode),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn), '\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, mode.chars_should_be_ascii()), _ => ascii_check(c, mode.allow_unicode_chars()),
}?; }?;
if chars.next().is_some() { if chars.next().is_some() {
return Err(EscapeError::MoreThanOneChar); return Err(EscapeError::MoreThanOneChar);
@ -336,12 +342,12 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
/// Takes the contents of a string literal (without quotes) and produces a /// Takes the contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors. /// sequence of escaped characters or errors.
fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F) fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<T, EscapeError>), F: FnMut(Range<usize>, Result<T, EscapeError>),
{ {
let mut chars = src.chars(); let mut chars = src.chars();
let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
// The `start` and `end` computation here is complicated because // The `start` and `end` computation here is complicated because
// `skip_ascii_whitespace` makes us skip over chars without counting // `skip_ascii_whitespace` makes us skip over chars without counting
@ -366,7 +372,7 @@ where
} }
'"' => Err(EscapeError::EscapeOnlyChar), '"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn), '\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, chars_should_be_ascii).map(Into::into), _ => ascii_check(c, allow_unicode_chars).map(T::from),
}; };
let end = src.len() - chars.as_str().len(); let end = src.len() - chars.as_str().len();
callback(start..end, res); callback(start..end, res);
@ -408,7 +414,7 @@ where
F: FnMut(Range<usize>, Result<char, EscapeError>), F: FnMut(Range<usize>, Result<char, EscapeError>),
{ {
let mut chars = src.chars(); let mut chars = src.chars();
let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
// The `start` and `end` computation here matches the one in // The `start` and `end` computation here matches the one in
// `unescape_non_raw_common` for consistency, even though this function // `unescape_non_raw_common` for consistency, even though this function
@ -417,7 +423,7 @@ where
let start = src.len() - chars.as_str().len() - c.len_utf8(); let start = src.len() - chars.as_str().len() - c.len_utf8();
let res = match c { let res = match c {
'\r' => Err(EscapeError::BareCarriageReturnInRawString), '\r' => Err(EscapeError::BareCarriageReturnInRawString),
_ => ascii_check(c, chars_should_be_ascii), _ => ascii_check(c, allow_unicode_chars),
}; };
let end = src.len() - chars.as_str().len(); let end = src.len() - chars.as_str().len();
callback(start..end, res); callback(start..end, res);
@ -430,7 +436,3 @@ pub fn byte_from_char(c: char) -> u8 {
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
res as u8 res as u8
} }
fn is_ascii(x: u32) -> bool {
x <= 0x7F
}

View file

@ -100,7 +100,7 @@ fn test_unescape_char_good() {
fn test_unescape_str_warn() { fn test_unescape_str_warn() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) { fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len()); let mut unescaped = Vec::with_capacity(literal.len());
unescape_literal(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected); assert_eq!(unescaped, expected);
} }
@ -124,7 +124,7 @@ fn test_unescape_str_warn() {
fn test_unescape_str_good() { fn test_unescape_str_good() {
fn check(literal_text: &str, expected: &str) { fn check(literal_text: &str, expected: &str) {
let mut buf = Ok(String::with_capacity(literal_text.len())); let mut buf = Ok(String::with_capacity(literal_text.len()));
unescape_literal(literal_text, Mode::Str, &mut |range, c| { unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
if let Ok(b) = &mut buf { if let Ok(b) = &mut buf {
match c { match c {
Ok(c) => b.push(c), Ok(c) => b.push(c),
@ -241,7 +241,7 @@ fn test_unescape_byte_good() {
fn test_unescape_byte_str_good() { fn test_unescape_byte_str_good() {
fn check(literal_text: &str, expected: &[u8]) { fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len())); let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| { unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
if let Ok(b) = &mut buf { if let Ok(b) = &mut buf {
match c { match c {
Ok(c) => b.push(byte_from_char(c)), Ok(c) => b.push(byte_from_char(c)),
@ -264,7 +264,7 @@ fn test_unescape_byte_str_good() {
fn test_unescape_raw_str() { fn test_unescape_raw_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) { fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len()); let mut unescaped = Vec::with_capacity(literal.len());
unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected); assert_eq!(unescaped, expected);
} }
@ -276,7 +276,7 @@ fn test_unescape_raw_str() {
fn test_unescape_raw_byte_str() { fn test_unescape_raw_byte_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) { fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len()); let mut unescaped = Vec::with_capacity(literal.len());
unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected); assert_eq!(unescaped, expected);
} }

View file

@ -400,7 +400,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
.with_code(error_code!(E0762)) .with_code(error_code!(E0762))
.emit() .emit()
} }
self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' ' self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' '
} }
rustc_lexer::LiteralKind::Byte { terminated } => { rustc_lexer::LiteralKind::Byte { terminated } => {
if !terminated { if !terminated {
@ -412,7 +412,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
.with_code(error_code!(E0763)) .with_code(error_code!(E0763))
.emit() .emit()
} }
self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
} }
rustc_lexer::LiteralKind::Str { terminated } => { rustc_lexer::LiteralKind::Str { terminated } => {
if !terminated { if !terminated {
@ -424,7 +424,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
.with_code(error_code!(E0765)) .with_code(error_code!(E0765))
.emit() .emit()
} }
self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " " self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " "
} }
rustc_lexer::LiteralKind::ByteStr { terminated } => { rustc_lexer::LiteralKind::ByteStr { terminated } => {
if !terminated { if !terminated {
@ -436,7 +436,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
.with_code(error_code!(E0766)) .with_code(error_code!(E0766))
.emit() .emit()
} }
self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
} }
rustc_lexer::LiteralKind::CStr { terminated } => { rustc_lexer::LiteralKind::CStr { terminated } => {
if !terminated { if !terminated {
@ -448,13 +448,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
.with_code(error_code!(E0767)) .with_code(error_code!(E0767))
.emit() .emit()
} }
self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" " self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
} }
rustc_lexer::LiteralKind::RawStr { n_hashes } => { rustc_lexer::LiteralKind::RawStr { n_hashes } => {
if let Some(n_hashes) = n_hashes { if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes); let n = u32::from(n_hashes);
let kind = token::StrRaw(n_hashes); let kind = token::StrRaw(n_hashes);
self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
} else { } else {
self.report_raw_str_error(start, 1); self.report_raw_str_error(start, 1);
} }
@ -463,7 +463,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
if let Some(n_hashes) = n_hashes { if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes); let n = u32::from(n_hashes);
let kind = token::ByteStrRaw(n_hashes); let kind = token::ByteStrRaw(n_hashes);
self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
} else { } else {
self.report_raw_str_error(start, 2); self.report_raw_str_error(start, 2);
} }
@ -472,7 +472,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
if let Some(n_hashes) = n_hashes { if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes); let n = u32::from(n_hashes);
let kind = token::CStrRaw(n_hashes); let kind = token::CStrRaw(n_hashes);
self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
} else { } else {
self.report_raw_str_error(start, 2); self.report_raw_str_error(start, 2);
} }
@ -735,7 +735,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
} }
} }
fn cook_quoted( fn cook_unicode(
&self, &self,
kind: token::LitKind, kind: token::LitKind,
mode: Mode, mode: Mode,
@ -745,13 +745,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
postfix_len: u32, postfix_len: u32,
) -> (token::LitKind, Symbol) { ) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_literal(src, mode, &mut |span, result| { unescape::unescape_unicode(src, mode, &mut |span, result| {
callback(span, result.map(drop)) callback(span, result.map(drop))
}) })
}) })
} }
fn cook_c_string( fn cook_mixed(
&self, &self,
kind: token::LitKind, kind: token::LitKind,
mode: Mode, mode: Mode,
@ -761,7 +761,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
postfix_len: u32, postfix_len: u32,
) -> (token::LitKind, Symbol) { ) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_c_string(src, mode, &mut |span, result| { unescape::unescape_mixed(src, mode, &mut |span, result| {
callback(span, result.map(drop)) callback(span, result.map(drop))
}) })
}) })

View file

@ -1056,7 +1056,7 @@ fn find_width_map_from_snippet(
fn unescape_string(string: &str) -> Option<string::String> { fn unescape_string(string: &str) -> Option<string::String> {
let mut buf = string::String::new(); let mut buf = string::String::new();
let mut ok = true; let mut ok = true;
unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| { unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| {
match unescaped_char { match unescaped_char {
Ok(c) => buf.push(c), Ok(c) => buf.push(c),
Err(_) => ok = false, Err(_) => ok = false,

View file

@ -928,7 +928,7 @@ fn remove_line_splices(s: &str) -> String {
.and_then(|s| s.strip_suffix('"')) .and_then(|s| s.strip_suffix('"'))
.unwrap_or_else(|| panic!("expected quoted string, found `{s}`")); .unwrap_or_else(|| panic!("expected quoted string, found `{s}`"));
let mut res = String::with_capacity(s.len()); let mut res = String::with_capacity(s.len());
unescape::unescape_literal(s, unescape::Mode::Str, &mut |range, ch| { unescape::unescape_unicode(s, unescape::Mode::Str, &mut |range, ch| {
if ch.is_ok() { if ch.is_ok() {
res.push_str(&s[range]); res.push_str(&s[range]);
} }

View file

@ -379,14 +379,14 @@ fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str {
let mut error_message = ""; let mut error_message = "";
match mode { match mode {
Mode::CStr => { Mode::CStr => {
rustc_lexer::unescape::unescape_c_string(text, mode, &mut |_, res| { rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| {
if let Err(e) = res { if let Err(e) = res {
error_message = error_to_diagnostic_message(e, mode); error_message = error_to_diagnostic_message(e, mode);
} }
}); });
} }
Mode::ByteStr | Mode::Str => { Mode::ByteStr | Mode::Str => {
rustc_lexer::unescape::unescape_literal(text, mode, &mut |_, res| { rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| {
if let Err(e) = res { if let Err(e) = res {
error_message = error_to_diagnostic_message(e, mode); error_message = error_to_diagnostic_message(e, mode);
} }

View file

@ -6,7 +6,7 @@ use std::{
}; };
use rustc_lexer::unescape::{ use rustc_lexer::unescape::{
unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
}; };
use crate::{ use crate::{
@ -193,7 +193,7 @@ pub trait IsString: AstToken {
let text = &self.text()[text_range_no_quotes - start]; let text = &self.text()[text_range_no_quotes - start];
let offset = text_range_no_quotes.start() - start; let offset = text_range_no_quotes.start() - start;
unescape_literal(text, Self::MODE, &mut |range, unescaped_char| { unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| {
let text_range = let text_range =
TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
cb(text_range + offset, unescaped_char); cb(text_range + offset, unescaped_char);
@ -226,7 +226,7 @@ impl ast::String {
let mut buf = String::new(); let mut buf = String::new();
let mut prev_end = 0; let mut prev_end = 0;
let mut has_error = false; let mut has_error = false;
unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match ( unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
unescaped_char, unescaped_char,
buf.capacity() == 0, buf.capacity() == 0,
) { ) {
@ -270,7 +270,7 @@ impl ast::ByteString {
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
let mut prev_end = 0; let mut prev_end = 0;
let mut has_error = false; let mut has_error = false;
unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match ( unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
unescaped_char, unescaped_char,
buf.capacity() == 0, buf.capacity() == 0,
) { ) {
@ -311,7 +311,7 @@ impl IsString for ast::CString {
let text = &self.text()[text_range_no_quotes - start]; let text = &self.text()[text_range_no_quotes - start];
let offset = text_range_no_quotes.start() - start; let offset = text_range_no_quotes.start() - start;
unescape_c_string(text, Self::MODE, &mut |range, unescaped_char| { unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| {
let text_range = let text_range =
TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
// XXX: This method should only be used for highlighting ranges. The unescaped // XXX: This method should only be used for highlighting ranges. The unescaped
@ -336,12 +336,11 @@ impl ast::CString {
let mut buf = Vec::new(); let mut buf = Vec::new();
let mut prev_end = 0; let mut prev_end = 0;
let mut has_error = false; let mut has_error = false;
let mut char_buf = [0u8; 4]; let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit {
let mut extend_unit = |buf: &mut Vec<u8>, unit: CStrUnit| match unit { MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
CStrUnit::Byte(b) => buf.push(b), MixedUnit::HighByte(b) => buf.push(b),
CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()),
}; };
unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match ( unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match (
unescaped, unescaped,
buf.capacity() == 0, buf.capacity() == 0,
) { ) {

View file

@ -5,7 +5,7 @@
mod block; mod block;
use rowan::Direction; use rowan::Direction;
use rustc_lexer::unescape::{self, unescape_literal, Mode}; use rustc_lexer::unescape::{self, unescape_mixed, unescape_unicode, Mode};
use crate::{ use crate::{
algo, algo,
@ -140,7 +140,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
ast::LiteralKind::String(s) => { ast::LiteralKind::String(s) => {
if !s.is_raw() { if !s.is_raw() {
if let Some(without_quotes) = unquote(text, 1, '"') { if let Some(without_quotes) = unquote(text, 1, '"') {
unescape_literal(without_quotes, Mode::Str, &mut |range, char| { unescape_unicode(without_quotes, Mode::Str, &mut |range, char| {
if let Err(err) = char { if let Err(err) = char {
push_err(1, range.start, err); push_err(1, range.start, err);
} }
@ -151,7 +151,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
ast::LiteralKind::ByteString(s) => { ast::LiteralKind::ByteString(s) => {
if !s.is_raw() { if !s.is_raw() {
if let Some(without_quotes) = unquote(text, 2, '"') { if let Some(without_quotes) = unquote(text, 2, '"') {
unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| { unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| {
if let Err(err) = char { if let Err(err) = char {
push_err(1, range.start, err); push_err(1, range.start, err);
} }
@ -162,7 +162,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
ast::LiteralKind::CString(s) => { ast::LiteralKind::CString(s) => {
if !s.is_raw() { if !s.is_raw() {
if let Some(without_quotes) = unquote(text, 2, '"') { if let Some(without_quotes) = unquote(text, 2, '"') {
unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| { unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| {
if let Err(err) = char { if let Err(err) = char {
push_err(1, range.start, err); push_err(1, range.start, err);
} }
@ -172,7 +172,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
} }
ast::LiteralKind::Char(_) => { ast::LiteralKind::Char(_) => {
if let Some(without_quotes) = unquote(text, 1, '\'') { if let Some(without_quotes) = unquote(text, 1, '\'') {
unescape_literal(without_quotes, Mode::Char, &mut |range, char| { unescape_unicode(without_quotes, Mode::Char, &mut |range, char| {
if let Err(err) = char { if let Err(err) = char {
push_err(1, range.start, err); push_err(1, range.start, err);
} }
@ -181,7 +181,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
} }
ast::LiteralKind::Byte(_) => { ast::LiteralKind::Byte(_) => {
if let Some(without_quotes) = unquote(text, 2, '\'') { if let Some(without_quotes) = unquote(text, 2, '\'') {
unescape_literal(without_quotes, Mode::Byte, &mut |range, char| { unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| {
if let Err(err) = char { if let Err(err) = char {
push_err(2, range.start, err); push_err(2, range.start, err);
} }

View file

@ -1,5 +1,6 @@
// check-pass // check-pass
// ignore-tidy-tab // ignore-tidy-tab
// edition: 2021
fn main() { fn main() {
let s = "\ let s = "\
@ -8,11 +9,11 @@ fn main() {
//~^^^ WARNING multiple lines skipped by escaped newline //~^^^ WARNING multiple lines skipped by escaped newline
assert_eq!(s, ""); assert_eq!(s, "");
let s = "foo\ let s = c"foo\
  bar   bar
"; ";
//~^^^ WARNING whitespace symbol '\u{a0}' is not skipped //~^^^ WARNING whitespace symbol '\u{a0}' is not skipped
assert_eq!(s, "foo  bar\n "); assert_eq!(s, c"foo  bar\n ");
let s = "a\ let s = "a\
b"; b";
@ -22,10 +23,10 @@ fn main() {
b"; b";
assert_eq!(s, "ab"); assert_eq!(s, "ab");
let s = "a\ let s = b"a\
b"; b";
//~^^ WARNING whitespace symbol '\u{c}' is not skipped //~^^ WARNING whitespace symbol '\u{c}' is not skipped
// '\x0c' is ASCII whitespace, but it may not need skipped // '\x0c' is ASCII whitespace, but it may not need skipped
// discussion: https://github.com/rust-lang/rust/pull/108403 // discussion: https://github.com/rust-lang/rust/pull/108403
assert_eq!(s, "a\x0cb"); assert_eq!(s, b"a\x0cb");
} }

View file

@ -1,5 +1,5 @@
warning: multiple lines skipped by escaped newline warning: multiple lines skipped by escaped newline
--> $DIR/str-escape.rs:5:14 --> $DIR/str-escape.rs:6:14
| |
LL | let s = "\ LL | let s = "\
| ______________^ | ______________^
@ -8,20 +8,20 @@ LL | | ";
| |_____________^ skipping everything up to and including this point | |_____________^ skipping everything up to and including this point
warning: whitespace symbol '\u{a0}' is not skipped warning: whitespace symbol '\u{a0}' is not skipped
--> $DIR/str-escape.rs:11:17 --> $DIR/str-escape.rs:12:18
| |
LL | let s = "foo\ LL | let s = c"foo\
| _________________^ | __________________^
LL | |   bar LL | |   bar
| | ^ whitespace symbol '\u{a0}' is not skipped | | ^ whitespace symbol '\u{a0}' is not skipped
| |___| | |___|
| |
warning: whitespace symbol '\u{c}' is not skipped warning: whitespace symbol '\u{c}' is not skipped
--> $DIR/str-escape.rs:25:15 --> $DIR/str-escape.rs:26:16
| |
LL | let s = "a\ LL | let s = b"a\
| _______________^ | ________________^
LL | | b"; LL | | b";
| | ^- whitespace symbol '\u{c}' is not skipped | | ^- whitespace symbol '\u{c}' is not skipped
| |____| | |____|