1
Fork 0

Rollup merge of #114193 - crlf0710:lexer_unicode15, r=Manishearth

Update lexer emoji diagnostics to Unicode 15.0

This replaces the `unic-emoji-char` dependency tree (which hasn't been updated for a while) with the `unicode-properties` crate, which contains Unicode 15.0 data.

Improves diagnostics for emoji characters added in recent years (see tests).

cc #101840

cc ``@Manishearth``
This commit is contained in:
Matthias Krüger 2023-07-31 22:51:15 +02:00 committed by GitHub
commit 57c57a555b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 36 additions and 76 deletions

View file

@ -16,7 +16,11 @@ Rust lexer used by rustc. No stability guarantees are provided.
# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = "0.2.0"
unic-emoji-char = "0.9.0"
[dependencies.unicode-properties]
version = "0.1.0"
default-features = false
features = ["emoji"]
[dev-dependencies]
expect-test = "1.4.0"

View file

@ -34,6 +34,7 @@ pub use crate::cursor::Cursor;
use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::EOF_CHAR;
use unicode_properties::UnicodeEmoji;
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@ -428,9 +429,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
_ => Unknown,
};
let res = Token::new(token_kind, self.pos_within_token());
@ -514,9 +513,7 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
_ => Ident,
}
}
@ -525,7 +522,7 @@ impl Cursor<'_> {
// Start is already eaten, eat the rest of identifier.
self.eat_while(|c| {
unicode_xid::UnicodeXID::is_xid_continue(c)
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
|| (!c.is_ascii() && c.is_emoji_char())
|| c == '\u{200d}'
});
// Known prefixes must have been handled earlier. So if