1
Fork 0

refactor unescape

This commit is contained in:
Deadbeef 2023-03-06 14:14:55 +00:00
parent d5e7206ca6
commit 4c01d494b8
2 changed files with 75 additions and 38 deletions

View file

@ -86,7 +86,8 @@ where
let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte);
callback(0..(src.len() - chars.as_str().len()), res); callback(0..(src.len() - chars.as_str().len()), res);
} }
Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback),
Mode::RawStr | Mode::RawByteStr => { Mode::RawStr | Mode::RawByteStr => {
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
} }
@ -94,6 +95,7 @@ where
} }
} }
/// A unit within CStr. Must not be a nul character.
pub enum CStrUnit { pub enum CStrUnit {
Byte(u8), Byte(u8),
Char(char), Char(char),
@ -164,24 +166,52 @@ impl Mode {
} }
} }
pub fn is_byte(self) -> bool { /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
pub fn ascii_escapes_should_be_ascii(self) -> bool {
match self { match self {
Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => true, Mode::Char | Mode::Str | Mode::RawStr => true,
Mode::Char | Mode::Str | Mode::RawStr => false, Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false,
}
}
/// Reports whether plain (unescaped) characters in a literal of this mode
/// are restricted to the ASCII range.
///
/// Returns `true` for `Byte`, `ByteStr` and `RawByteStr`, and `false` for
/// `Char`, `Str`, `RawStr`, `CStr` and `RawCStr`.
pub fn characters_should_be_ascii(self) -> bool {
    // Each variant is spelled out (no wildcard arm), matching the other
    // predicates on `Mode`.
    match self {
        Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
        Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
    }
}
/// Reports whether `\u{...}` escapes are rejected in a literal of this mode.
///
/// Returns `true` for the byte-oriented modes (`Byte`, `ByteStr`,
/// `RawByteStr`) and `false` for `Char`, `Str`, `RawStr`, `CStr` and
/// `RawCStr`.
pub fn is_unicode_escape_disallowed(self) -> bool {
    // Each variant is spelled out (no wildcard arm), matching the other
    // predicates on `Mode`.
    match self {
        Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
        Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
    }
}
pub fn prefix_noraw(self) -> &'static str {
match self {
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
Mode::CStr | Mode::RawCStr => "c",
Mode::Char | Mode::Str | Mode::RawStr => "",
} }
} }
} }
fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { fn scan_escape<T: From<u8> + From<char>>(
chars: &mut Chars<'_>,
mode: Mode,
) -> Result<T, EscapeError> {
// Previous character was '\\', unescape what follows. // Previous character was '\\', unescape what follows.
let res = match chars.next().ok_or(EscapeError::LoneSlash)? { let res = match chars.next().ok_or(EscapeError::LoneSlash)? {
'"' => '"', '"' => b'"',
'n' => '\n', 'n' => b'\n',
'r' => '\r', 'r' => b'\r',
't' => '\t', 't' => b'\t',
'\\' => '\\', '\\' => b'\\',
'\'' => '\'', '\'' => b'\'',
'0' => '\0', '0' => b'\0',
'x' => { 'x' => {
// Parse hexadecimal character code. // Parse hexadecimal character code.
@ -194,22 +224,23 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError
let value = hi * 16 + lo; let value = hi * 16 + lo;
// For a non-byte literal verify that it is within ASCII range. if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) {
if !is_byte && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape); return Err(EscapeError::OutOfRangeHexEscape);
} }
let value = value as u8;
value as char value as u8
} }
'u' => scan_unicode(chars, is_byte)?, 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into),
_ => return Err(EscapeError::InvalidEscape), _ => return Err(EscapeError::InvalidEscape),
}; };
Ok(res) Ok(res.into())
} }
fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { fn scan_unicode(
chars: &mut Chars<'_>,
is_unicode_escape_disallowed: bool,
) -> Result<char, EscapeError> {
// We've parsed '\u', now we have to parse '{..}'. // We've parsed '\u', now we have to parse '{..}'.
if chars.next() != Some('{') { if chars.next() != Some('{') {
@ -237,7 +268,7 @@ fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeErro
// Incorrect syntax has higher priority for error reporting // Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal. // than unallowed value for a literal.
if is_byte { if is_unicode_escape_disallowed {
return Err(EscapeError::UnicodeEscapeInByte); return Err(EscapeError::UnicodeEscapeInByte);
} }
@ -263,8 +294,8 @@ fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeErro
} }
#[inline] #[inline]
fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> { fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result<char, EscapeError> {
if is_byte && !c.is_ascii() { if characters_should_be_ascii && !c.is_ascii() {
// Byte literal can't be a non-ascii character. // Byte literal can't be a non-ascii character.
Err(EscapeError::NonAsciiCharInByte) Err(EscapeError::NonAsciiCharInByte)
} else { } else {
@ -275,7 +306,7 @@ fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
let c = chars.next().ok_or(EscapeError::ZeroChars)?; let c = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = match c { let res = match c {
'\\' => scan_escape(chars, is_byte), '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn), '\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, is_byte), _ => ascii_check(c, is_byte),
@ -288,9 +319,9 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, E
/// Takes a contents of a string literal (without quotes) and produces a /// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors. /// sequence of escaped characters or errors.
fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F) fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
where where
F: FnMut(Range<usize>, Result<char, EscapeError>), F: FnMut(Range<usize>, Result<T, EscapeError>),
{ {
let mut chars = src.chars(); let mut chars = src.chars();
@ -312,17 +343,17 @@ where
}); });
continue; continue;
} }
_ => scan_escape(&mut chars, is_byte), _ => scan_escape::<T>(&mut chars, mode),
} }
} }
'\n' => Ok('\n'), '\n' => Ok(b'\n'.into()),
'\t' => Ok('\t'), '\t' => Ok(b'\t'.into()),
'"' => Err(EscapeError::EscapeOnlyChar), '"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn), '\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, is_byte), _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into),
}; };
let end = src.len() - chars.as_str().len(); let end = src.len() - chars.as_str().len();
callback(start..end, res); callback(start..end, res.map(Into::into));
} }
} }

View file

@ -78,8 +78,7 @@ pub(crate) fn emit_unescape_error(
} }
}; };
let sugg = sugg.unwrap_or_else(|| { let sugg = sugg.unwrap_or_else(|| {
let is_byte = mode.is_byte(); let prefix = mode.prefix_noraw();
let prefix = if is_byte { "b" } else { "" };
let mut escaped = String::with_capacity(lit.len()); let mut escaped = String::with_capacity(lit.len());
let mut chrs = lit.chars().peekable(); let mut chrs = lit.chars().peekable();
while let Some(first) = chrs.next() { while let Some(first) = chrs.next() {
@ -97,7 +96,11 @@ pub(crate) fn emit_unescape_error(
}; };
} }
let sugg = format!("{prefix}\"{escaped}\""); let sugg = format!("{prefix}\"{escaped}\"");
MoreThanOneCharSugg::Quotes { span: span_with_quotes, is_byte, sugg } MoreThanOneCharSugg::Quotes {
span: span_with_quotes,
is_byte: mode == Mode::Byte,
sugg,
}
}); });
handler.emit_err(UnescapeError::MoreThanOneChar { handler.emit_err(UnescapeError::MoreThanOneChar {
span: span_with_quotes, span: span_with_quotes,
@ -112,7 +115,7 @@ pub(crate) fn emit_unescape_error(
char_span, char_span,
escaped_sugg: c.escape_default().to_string(), escaped_sugg: c.escape_default().to_string(),
escaped_msg: escaped_char(c), escaped_msg: escaped_char(c),
byte: mode.is_byte(), byte: mode == Mode::Byte,
}); });
} }
EscapeError::BareCarriageReturn => { EscapeError::BareCarriageReturn => {
@ -126,12 +129,15 @@ pub(crate) fn emit_unescape_error(
EscapeError::InvalidEscape => { EscapeError::InvalidEscape => {
let (c, span) = last_char(); let (c, span) = last_char();
let label = let label = if mode == Mode::Byte || mode == Mode::ByteStr {
if mode.is_byte() { "unknown byte escape" } else { "unknown character escape" }; "unknown byte escape"
} else {
"unknown character escape"
};
let ec = escaped_char(c); let ec = escaped_char(c);
let mut diag = handler.struct_span_err(span, &format!("{}: `{}`", label, ec)); let mut diag = handler.struct_span_err(span, &format!("{}: `{}`", label, ec));
diag.span_label(span, label); diag.span_label(span, label);
if c == '{' || c == '}' && !mode.is_byte() { if c == '{' || c == '}' && matches!(mode, Mode::Str | Mode::RawStr) {
diag.help( diag.help(
"if used in a formatting string, curly braces are escaped with `{{` and `}}`", "if used in a formatting string, curly braces are escaped with `{{` and `}}`",
); );
@ -141,7 +147,7 @@ pub(crate) fn emit_unescape_error(
version control settings", version control settings",
); );
} else { } else {
if !mode.is_byte() { if mode == Mode::Str || mode == Mode::Char {
diag.span_suggestion( diag.span_suggestion(
span_with_quotes, span_with_quotes,
"if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal",