Auto merge of #88781 - estebank:emoji-idents, r=oli-obk

Tokenize emoji as if they were valid identifiers In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock down parse errors. Partially address #86102.
2021-11-25 08:16:08 +00:00 · 2021-11-25 08:16:08 +00:00 · 23a436606b
commit 23a436606b
parent c6eda7d8a7 d92916439c
13 changed files with 223 additions and 15 deletions
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@ -1,3 +1,4 @@
+use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
 use rustc_ast::tokenstream::{Spacing, TokenStream};
@ -222,6 +223,22 @@ impl<'a> StringReader<'a> {
                }
                token::Ident(sym, is_raw_ident)
            }
+            rustc_lexer::TokenKind::InvalidIdent
+                // Do not recover an identifier with emoji if the codepoint is a confusable
+                // with a recoverable substitution token, like `➖`.
+                if UNICODE_ARRAY
+                    .iter()
+                    .find(|&&(c, _, _)| {
+                        let sym = self.str_from(start);
+                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                    })
+                    .is_none() =>
+            {
+                let sym = nfc_normalize(self.str_from(start));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
+                token::Ident(sym, false)
+            }
            rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                let suffix_start = start + BytePos(suffix_start as u32);
                let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@ -293,7 +310,7 @@ impl<'a> StringReader<'a> {
            rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
            rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

-            rustc_lexer::TokenKind::Unknown => {
+            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                let c = self.str_from(start).chars().next().unwrap();
                let mut err =
                    self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);
--- a/compiler/rustc_parse/src/lexer/unicode_chars.rs
+++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
 use rustc_span::{symbol::kw, BytePos, Pos, Span};

 #[rustfmt::skip] // for line breaks
-const UNICODE_ARRAY: &[(char, &str, char)] = &[
+pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
    (' ', "Line Separator", ' '),
    (' ', "Paragraph Separator", ' '),
    (' ', "Ogham Space mark", ' '),