initial step towards implementing C string literals
This commit is contained in:
parent
7b99493492
commit
8ff3903643
17 changed files with 312 additions and 82 deletions
|
@ -186,12 +186,16 @@ pub enum LiteralKind {
|
|||
Str { terminated: bool },
|
||||
/// "b"abc"", "b"abc"
|
||||
ByteStr { terminated: bool },
|
||||
/// `c"abc"`, `c"abc`
|
||||
CStr { terminated: bool },
|
||||
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
|
||||
/// an invalid literal.
|
||||
RawStr { n_hashes: Option<u8> },
|
||||
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
|
||||
/// indicates an invalid literal.
|
||||
RawByteStr { n_hashes: Option<u8> },
|
||||
/// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` is invalid.
|
||||
RawCStr { n_hashes: Option<u8> },
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
|
@ -391,6 +395,32 @@ impl Cursor<'_> {
|
|||
_ => self.ident_or_unknown_prefix(),
|
||||
},
|
||||
|
||||
// TODO deduplicate this code
|
||||
// c-string literal, raw c-string literal or identifier.
|
||||
'c' => match (self.first(), self.second()) {
|
||||
('"', _) => {
|
||||
self.bump();
|
||||
let terminated = self.double_quoted_string();
|
||||
let suffix_start = self.pos_within_token();
|
||||
if terminated {
|
||||
self.eat_literal_suffix();
|
||||
}
|
||||
let kind = CStr { terminated };
|
||||
Literal { kind, suffix_start }
|
||||
}
|
||||
('r', '"') | ('r', '#') => {
|
||||
self.bump();
|
||||
let res = self.raw_double_quoted_string(2);
|
||||
let suffix_start = self.pos_within_token();
|
||||
if res.is_ok() {
|
||||
self.eat_literal_suffix();
|
||||
}
|
||||
let kind = RawCStr { n_hashes: res.ok() };
|
||||
Literal { kind, suffix_start }
|
||||
}
|
||||
_ => self.ident_or_unknown_prefix(),
|
||||
},
|
||||
|
||||
// Identifier (this should be checked after other variant that can
|
||||
// start as identifier).
|
||||
c if is_id_start(c) => self.ident_or_unknown_prefix(),
|
||||
|
|
|
@ -90,6 +90,39 @@ where
|
|||
Mode::RawStr | Mode::RawByteStr => {
|
||||
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
|
||||
}
|
||||
Mode::CStr | Mode::RawCStr => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
pub enum CStrUnit {
|
||||
Byte(u8),
|
||||
Char(char),
|
||||
}
|
||||
|
||||
impl From<u8> for CStrUnit {
|
||||
fn from(value: u8) -> Self {
|
||||
CStrUnit::Byte(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<char> for CStrUnit {
|
||||
fn from(value: char) -> Self {
|
||||
CStrUnit::Char(value)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
|
||||
where
|
||||
F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
|
||||
{
|
||||
if mode == Mode::RawCStr {
|
||||
unescape_raw_str_or_raw_byte_str(
|
||||
src,
|
||||
mode.characters_should_be_ascii(),
|
||||
&mut |r, result| callback(r, result.map(CStrUnit::Char)),
|
||||
);
|
||||
} else {
|
||||
unescape_str_common(src, mode, callback);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -114,19 +147,26 @@ pub enum Mode {
|
|||
ByteStr,
|
||||
RawStr,
|
||||
RawByteStr,
|
||||
CStr,
|
||||
RawCStr,
|
||||
}
|
||||
|
||||
impl Mode {
|
||||
pub fn in_double_quotes(self) -> bool {
|
||||
match self {
|
||||
Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
|
||||
Mode::Str
|
||||
| Mode::ByteStr
|
||||
| Mode::RawStr
|
||||
| Mode::RawByteStr
|
||||
| Mode::CStr
|
||||
| Mode::RawCStr => true,
|
||||
Mode::Char | Mode::Byte => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_byte(self) -> bool {
|
||||
match self {
|
||||
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
|
||||
Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => true,
|
||||
Mode::Char | Mode::Str | Mode::RawStr => false,
|
||||
}
|
||||
}
|
||||
|
@ -163,64 +203,65 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError
|
|||
value as char
|
||||
}
|
||||
|
||||
'u' => {
|
||||
// We've parsed '\u', now we have to parse '{..}'.
|
||||
|
||||
if chars.next() != Some('{') {
|
||||
return Err(EscapeError::NoBraceInUnicodeEscape);
|
||||
}
|
||||
|
||||
// First character must be a hexadecimal digit.
|
||||
let mut n_digits = 1;
|
||||
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
|
||||
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
|
||||
'}' => return Err(EscapeError::EmptyUnicodeEscape),
|
||||
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
|
||||
};
|
||||
|
||||
// First character is valid, now parse the rest of the number
|
||||
// and closing brace.
|
||||
loop {
|
||||
match chars.next() {
|
||||
None => return Err(EscapeError::UnclosedUnicodeEscape),
|
||||
Some('_') => continue,
|
||||
Some('}') => {
|
||||
if n_digits > 6 {
|
||||
return Err(EscapeError::OverlongUnicodeEscape);
|
||||
}
|
||||
|
||||
// Incorrect syntax has higher priority for error reporting
|
||||
// than unallowed value for a literal.
|
||||
if is_byte {
|
||||
return Err(EscapeError::UnicodeEscapeInByte);
|
||||
}
|
||||
|
||||
break std::char::from_u32(value).ok_or_else(|| {
|
||||
if value > 0x10FFFF {
|
||||
EscapeError::OutOfRangeUnicodeEscape
|
||||
} else {
|
||||
EscapeError::LoneSurrogateUnicodeEscape
|
||||
}
|
||||
})?;
|
||||
}
|
||||
Some(c) => {
|
||||
let digit: u32 =
|
||||
c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
|
||||
n_digits += 1;
|
||||
if n_digits > 6 {
|
||||
// Stop updating value since we're sure that it's incorrect already.
|
||||
continue;
|
||||
}
|
||||
value = value * 16 + digit;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
'u' => scan_unicode(chars, is_byte)?,
|
||||
_ => return Err(EscapeError::InvalidEscape),
|
||||
};
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
|
||||
// We've parsed '\u', now we have to parse '{..}'.
|
||||
|
||||
if chars.next() != Some('{') {
|
||||
return Err(EscapeError::NoBraceInUnicodeEscape);
|
||||
}
|
||||
|
||||
// First character must be a hexadecimal digit.
|
||||
let mut n_digits = 1;
|
||||
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
|
||||
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
|
||||
'}' => return Err(EscapeError::EmptyUnicodeEscape),
|
||||
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
|
||||
};
|
||||
|
||||
// First character is valid, now parse the rest of the number
|
||||
// and closing brace.
|
||||
loop {
|
||||
match chars.next() {
|
||||
None => return Err(EscapeError::UnclosedUnicodeEscape),
|
||||
Some('_') => continue,
|
||||
Some('}') => {
|
||||
if n_digits > 6 {
|
||||
return Err(EscapeError::OverlongUnicodeEscape);
|
||||
}
|
||||
|
||||
// Incorrect syntax has higher priority for error reporting
|
||||
// than unallowed value for a literal.
|
||||
if is_byte {
|
||||
return Err(EscapeError::UnicodeEscapeInByte);
|
||||
}
|
||||
|
||||
break std::char::from_u32(value).ok_or_else(|| {
|
||||
if value > 0x10FFFF {
|
||||
EscapeError::OutOfRangeUnicodeEscape
|
||||
} else {
|
||||
EscapeError::LoneSurrogateUnicodeEscape
|
||||
}
|
||||
});
|
||||
}
|
||||
Some(c) => {
|
||||
let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
|
||||
n_digits += 1;
|
||||
if n_digits > 6 {
|
||||
// Stop updating value since we're sure that it's incorrect already.
|
||||
continue;
|
||||
}
|
||||
value = value * 16 + digit;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
|
||||
if is_byte && !c.is_ascii() {
|
||||
|
@ -266,7 +307,9 @@ where
|
|||
// if unescaped '\' character is followed by '\n'.
|
||||
// For details see [Rust language reference]
|
||||
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
|
||||
skip_ascii_whitespace(&mut chars, start, callback);
|
||||
skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
|
||||
callback(range, Err(err))
|
||||
});
|
||||
continue;
|
||||
}
|
||||
_ => scan_escape(&mut chars, is_byte),
|
||||
|
@ -281,32 +324,32 @@ where
|
|||
let end = src.len() - chars.as_str().len();
|
||||
callback(start..end, res);
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
|
||||
where
|
||||
F: FnMut(Range<usize>, Result<char, EscapeError>),
|
||||
{
|
||||
let tail = chars.as_str();
|
||||
let first_non_space = tail
|
||||
.bytes()
|
||||
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
|
||||
.unwrap_or(tail.len());
|
||||
if tail[1..first_non_space].contains('\n') {
|
||||
// The +1 accounts for the escaping slash.
|
||||
let end = start + first_non_space + 1;
|
||||
callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning));
|
||||
}
|
||||
let tail = &tail[first_non_space..];
|
||||
if let Some(c) = tail.chars().nth(0) {
|
||||
if c.is_whitespace() {
|
||||
// For error reporting, we would like the span to contain the character that was not
|
||||
// skipped. The +1 is necessary to account for the leading \ that started the escape.
|
||||
let end = start + first_non_space + c.len_utf8() + 1;
|
||||
callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning));
|
||||
}
|
||||
}
|
||||
*chars = tail.chars();
|
||||
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
|
||||
where
|
||||
F: FnMut(Range<usize>, EscapeError),
|
||||
{
|
||||
let tail = chars.as_str();
|
||||
let first_non_space = tail
|
||||
.bytes()
|
||||
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
|
||||
.unwrap_or(tail.len());
|
||||
if tail[1..first_non_space].contains('\n') {
|
||||
// The +1 accounts for the escaping slash.
|
||||
let end = start + first_non_space + 1;
|
||||
callback(start..end, EscapeError::MultipleSkippedLinesWarning);
|
||||
}
|
||||
let tail = &tail[first_non_space..];
|
||||
if let Some(c) = tail.chars().nth(0) {
|
||||
if c.is_whitespace() {
|
||||
// For error reporting, we would like the span to contain the character that was not
|
||||
// skipped. The +1 is necessary to account for the leading \ that started the escape.
|
||||
let end = start + first_non_space + c.len_utf8() + 1;
|
||||
callback(start..end, EscapeError::UnskippedWhitespaceWarning);
|
||||
}
|
||||
}
|
||||
*chars = tail.chars();
|
||||
}
|
||||
|
||||
/// Takes a contents of a string literal (without quotes) and produces a
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue