Rollup merge of #133070 - nnethercote:lexer-tweaks, r=chenyukang

Lexer tweaks: some cleanups and small performance improvements. r? `@chenyukang`
2024-11-26 12:03:39 -05:00 · 2024-11-26 12:03:39 -05:00 · 9d6a11a435
commit 9d6a11a435
parent 5915190fed 16a39bb7ca
5 changed files with 124 additions and 128 deletions
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@ -566,19 +566,19 @@ impl Cursor<'_> {
    fn c_or_byte_string(
        &mut self,
-        mk_kind: impl FnOnce(bool) -> LiteralKind,
+        mk_kind: fn(bool) -> LiteralKind,
-        mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
+        mk_kind_raw: fn(Option<u8>) -> LiteralKind,
        single_quoted: Option<fn(bool) -> LiteralKind>,
    ) -> TokenKind {
        match (self.first(), self.second(), single_quoted) {
-            ('\'', _, Some(mk_kind)) => {
+            ('\'', _, Some(single_quoted)) => {
                self.bump();
                let terminated = self.single_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
-                let kind = mk_kind(terminated);
+                let kind = single_quoted(terminated);
                Literal { kind, suffix_start }
            }
            ('"', _, _) => {
--- a/compiler/rustc_lexer/src/tests.rs
+++ b/compiler/rustc_lexer/src/tests.rs
@ -77,63 +77,53 @@ fn test_too_many_hashes() {
    check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
 }
 // https://github.com/rust-lang/rust/issues/70528
 #[test]
 fn test_valid_shebang() {
-    // https://github.com/rust-lang/rust/issues/70528
+    let input = "#!/bin/bash";
-    let input = "#!/usr/bin/rustrun\nlet x = 5;";
+    assert_eq!(strip_shebang(input), Some(input.len()));
    assert_eq!(strip_shebang(input), Some(18));
 }
-#[test]
+    let input = "#![attribute]";
-fn test_invalid_shebang_valid_rust_syntax() {
+    assert_eq!(strip_shebang(input), None);
-    // https://github.com/rust-lang/rust/issues/70528
+
-    let input = "#!    [bad_attribute]";
+    let input = "#!    /bin/bash";
    assert_eq!(strip_shebang(input), Some(input.len()));
    let input = "#!    [attribute]";
    assert_eq!(strip_shebang(input), None);
    let input = "#! /* blah */  /bin/bash";
    assert_eq!(strip_shebang(input), Some(input.len()));
    let input = "#! /* blah */  [attribute]";
    assert_eq!(strip_shebang(input), None);
    let input = "#! // blah\n/bin/bash";
    assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline
    let input = "#! // blah\n[attribute]";
    assert_eq!(strip_shebang(input), None);
    let input = "#! /* blah\nblah\nblah */  /bin/bash";
    assert_eq!(strip_shebang(input), Some(10));
    let input = "#! /* blah\nblah\nblah */  [attribute]";
    assert_eq!(strip_shebang(input), None);
    let input = "#!\n/bin/sh";
    assert_eq!(strip_shebang(input), Some(2));
    let input = "#!\n[attribute]";
    assert_eq!(strip_shebang(input), None);
 }
 // Shebangs are interpreted by the kernel, so they only count on the first line:
 // a `#!` that follows a leading newline must not be stripped.
 #[test]
 fn test_shebang_second_line() {
    let stripped = strip_shebang("\n#!/bin/bash");
    assert_eq!(stripped, None);
 }
-#[test]
+    let input = "\n#![attribute]";
 // Spaces between `#!` and the interpreter path are accepted; the whole
 // (single-line) input is reported as the shebang.
 fn test_shebang_space() {
    let script = "#!    /bin/bash";
    let whole_input = script.len();
    assert_eq!(strip_shebang(script), Some(whole_input));
 }
 // `#!` followed only by spaces on its own line, with `[` opening the next line,
 // is expected not to be treated as a shebang.
 #[test]
 fn test_shebang_empty_shebang() {
    assert_eq!(strip_shebang("#!    \n[attribute(foo)]"), None);
 }
 // The text after `#!` is a `//` line comment and the next line starts with `[`:
 // expected not to be treated as a shebang.
 #[test]
 fn test_invalid_shebang_comment() {
    assert_eq!(strip_shebang("#!//bin/ami/a/comment\n["), None);
 }
 // Same as the line-comment case, but with a `/* ... */` block comment after `#!`
 // and `[attribute` on the following line: expected not to be a shebang.
 #[test]
 fn test_invalid_shebang_another_comment() {
    assert_eq!(strip_shebang("#!/*bin/ami/a/comment*/\n[attribute"), None);
 }
 // A block comment after `#!` followed by something other than `[` (here an item)
 // is expected to be stripped as a shebang.
 #[test]
 fn test_shebang_valid_rust_after() {
    let source = "#!/*bin/ami/a/comment*/\npub fn main() {}";
    // 23 is the byte length of the first line, i.e. everything before the `\n`.
    let first_line_len = 23;
    assert_eq!(strip_shebang(source), Some(first_line_len));
 }
 // A shebang line stays a shebang even when the next line is an inner attribute.
 #[test]
 fn test_shebang_followed_by_attrib() {
    let source = "#!/bin/rust-scripts\n#![allow_unused(true)]";
    // 19 is the byte length of the first line, i.e. everything before the `\n`.
    let first_line_len = 19;
    assert_eq!(strip_shebang(source), Some(first_line_len));
 }
 fn check_lexing(src: &str, expect: Expect) {
    let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
    expect.assert_eq(&actual)
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol;
 use rustc_span::{BytePos, Pos, Span};
 use tracing::debug;
 use crate::lexer::diagnostics::TokenTreeDiagInfo;
 use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use crate::{errors, make_unclosed_delims_error};
@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
    }
    let cursor = Cursor::new(src);
-    let string_reader = StringReader {
+    let mut lexer = Lexer {
        psess,
        start_pos,
        pos: start_pos,
@ -65,34 +66,31 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
        override_span,
        nbsp_is_whitespace: false,
        last_lifetime: None,
        token: Token::dummy(),
        diag_info: TokenTreeDiagInfo::default(),
    };
-    let (stream, res, unmatched_delims) =
+    let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false);
-        tokentrees::TokenTreesReader::lex_all_token_trees(string_reader);
+    let unmatched_delims = lexer.diag_info.unmatched_delims;
-    match res {
+
-        Ok(()) if unmatched_delims.is_empty() => Ok(stream),
+    if res.is_ok() && unmatched_delims.is_empty() {
-        _ => {
+        Ok(stream)
    } else {
        // Return an error if there are unmatched or unclosed delimiters.
        // We emit delimiter mismatch errors first, then the unclosed delimiter errors,
        // because a delimiter mismatch is more likely to be the root cause of the error.
-
+        let mut buffer: Vec<_> = unmatched_delims
-            let mut buffer = Vec::with_capacity(1);
+            .into_iter()
-            for unmatched in unmatched_delims {
+            .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
-                if let Some(err) = make_unclosed_delims_error(unmatched, psess) {
+            .collect();
                    buffer.push(err);
                }
            }
        if let Err(errs) = res {
            // Add unclosing delimiter or diff marker errors
-                for err in errs {
+            buffer.extend(errs);
                    buffer.push(err);
                }
        }
        Err(buffer)
    }
 }
 }
-struct StringReader<'psess, 'src> {
+struct Lexer<'psess, 'src> {
    psess: &'psess ParseSess,
    /// Initial position, read-only.
    start_pos: BytePos,
@ -111,9 +109,14 @@ struct StringReader<'psess, 'src> {
    /// Track the `Span` for the leading `'` of the last lifetime. Used for
    /// diagnostics to detect possible typo where `"` was meant.
    last_lifetime: Option<Span>,
    /// The current token.
    token: Token,
    diag_info: TokenTreeDiagInfo,
 }
-impl<'psess, 'src> StringReader<'psess, 'src> {
+impl<'psess, 'src> Lexer<'psess, 'src> {
    fn dcx(&self) -> DiagCtxtHandle<'psess> {
        self.psess.dcx()
    }
@ -124,7 +127,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
    /// Returns the next token, paired with a bool indicating if the token was
    /// preceded by whitespace.
-    fn next_token(&mut self) -> (Token, bool) {
+    fn next_token_from_cursor(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens
@ -231,7 +234,8 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                        .push(span);
                    token::Ident(sym, IdentIsRaw::No)
                }
-                // split up (raw) c string literals to an ident and a string literal when edition < 2021.
+                // split up (raw) c string literals to an ident and a string literal when edition <
                // 2021.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
@ -252,7 +256,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
-                rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
+                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    self.maybe_report_guarded_str(start, str_before)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@ -296,13 +302,20 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                    if prefix_span.at_least_rust_2021() {
                        let span = self.mk_sp(start, self.pos);
-                        let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start));
+                        let lifetime_name_without_tick =
                            Symbol::intern(&self.str_from(ident_start));
                        if !lifetime_name_without_tick.can_be_raw() {
-                            self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick });
+                            self.dcx().emit_err(
                                errors::CannotBeRawLifetime {
                                    span,
                                    ident: lifetime_name_without_tick
                                }
                            );
                        }
                        // Put the `'` back onto the lifetime name.
-                        let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
+                        let mut lifetime_name =
                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                        lifetime_name.push('\'');
                        lifetime_name += lifetime_name_without_tick.as_str();
                        let sym = Symbol::intern(&lifetime_name);
--- a/compiler/rustc_parse/src/lexer/tokentrees.rs
+++ b/compiler/rustc_parse/src/lexer/tokentrees.rs
@ -4,41 +4,19 @@ use rustc_ast_pretty::pprust::token_to_string;
 use rustc_errors::{Applicability, PErr};
 use rustc_span::symbol::kw;
-use super::diagnostics::{
+use super::diagnostics::{report_suspicious_mismatch_block, same_indentation_level};
-    TokenTreeDiagInfo, report_suspicious_mismatch_block, same_indentation_level,
+use super::{Lexer, UnmatchedDelim};
 };
 use super::{StringReader, UnmatchedDelim};
 use crate::Parser;
-pub(super) struct TokenTreesReader<'psess, 'src> {
+impl<'psess, 'src> Lexer<'psess, 'src> {
    string_reader: StringReader<'psess, 'src>,
    /// The "next" token, which has been obtained from the `StringReader` but
    /// not yet handled by the `TokenTreesReader`.
    token: Token,
    diag_info: TokenTreeDiagInfo,
 }
 impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
    pub(super) fn lex_all_token_trees(
        string_reader: StringReader<'psess, 'src>,
    ) -> (TokenStream, Result<(), Vec<PErr<'psess>>>, Vec<UnmatchedDelim>) {
        let mut tt_reader = TokenTreesReader {
            string_reader,
            token: Token::dummy(),
            diag_info: TokenTreeDiagInfo::default(),
        };
        let (_open_spacing, stream, res) = tt_reader.lex_token_trees(/* is_delimited */ false);
        (stream, res, tt_reader.diag_info.unmatched_delims)
    }
    // Lex into a token stream. The `Spacing` in the result is that of the
    // opening delimiter.
-    fn lex_token_trees(
+    pub(super) fn lex_token_trees(
        &mut self,
        is_delimited: bool,
    ) -> (Spacing, TokenStream, Result<(), Vec<PErr<'psess>>>) {
        // Move past the opening delimiter.
-        let (_, open_spacing) = self.bump(false);
+        let open_spacing = self.bump_minimal();
        let mut buf = Vec::new();
        loop {
@ -71,7 +49,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
                }
                _ => {
                    // Get the next normal token.
-                    let (this_tok, this_spacing) = self.bump(true);
+                    let (this_tok, this_spacing) = self.bump();
                    buf.push(TokenTree::Token(this_tok, this_spacing));
                }
            }
@ -80,7 +58,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
    fn eof_err(&mut self) -> PErr<'psess> {
        let msg = "this file contains an unclosed delimiter";
-        let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg);
+        let mut err = self.dcx().struct_span_err(self.token.span, msg);
        let unclosed_delimiter_show_limit = 5;
        let len = usize::min(unclosed_delimiter_show_limit, self.diag_info.open_braces.len());
@ -110,7 +88,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
            report_suspicious_mismatch_block(
                &mut err,
                &self.diag_info,
-                self.string_reader.psess.source_map(),
+                self.psess.source_map(),
                *delim,
            )
        }
@ -136,7 +114,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
        // Expand to cover the entire delimited token tree.
        let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
-        let sm = self.string_reader.psess.source_map();
+        let sm = self.psess.source_map();
        let close_spacing = match self.token.kind {
            // Correct delimiter.
@ -160,7 +138,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
                }
                // Move past the closing delimiter.
-                self.bump(false).1
+                self.bump_minimal()
            }
            // Incorrect delimiter.
            token::CloseDelim(close_delim) => {
@ -203,7 +181,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
                //     bar(baz(
                // }  // Incorrect delimiter but matches the earlier `{`
                if !self.diag_info.open_braces.iter().any(|&(b, _)| b == close_delim) {
-                    self.bump(false).1
+                    self.bump_minimal()
                } else {
                    // The choice of value here doesn't matter.
                    Spacing::Alone
@ -225,14 +203,14 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
    }
    // Move on to the next token, returning the current token and its spacing.
-    // Will glue adjacent single-char tokens together if `glue` is set.
+    // Will glue adjacent single-char tokens together.
-    fn bump(&mut self, glue: bool) -> (Token, Spacing) {
+    fn bump(&mut self) -> (Token, Spacing) {
        let (this_spacing, next_tok) = loop {
-            let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token();
+            let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor();
            if is_next_tok_preceded_by_whitespace {
                break (Spacing::Alone, next_tok);
-            } else if glue && let Some(glued) = self.token.glue(&next_tok) {
+            } else if let Some(glued) = self.token.glue(&next_tok) {
                self.token = glued;
            } else {
                let this_spacing = if next_tok.is_punct() {
@ -249,6 +227,26 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
        (this_tok, this_spacing)
    }
    // Slimmed-down counterpart of `bump`, used when the token kind is known in advance.
    // It fetches the next token, stores it as the current one, and returns only the
    // computed spacing; unlike `bump` it does no gluing and does not hand back a token.
    fn bump_minimal(&mut self) -> Spacing {
        let (upcoming, after_whitespace) = self.next_token_from_cursor();
        // Whitespace before the next token always yields `Alone`; otherwise the
        // spacing is decided by what the next token is.
        let spacing = if after_whitespace {
            Spacing::Alone
        } else if upcoming.is_punct() {
            Spacing::Joint
        } else if upcoming == token::Eof {
            Spacing::Alone
        } else {
            Spacing::JointHidden
        };
        self.token = upcoming;
        spacing
    }
    fn unclosed_delim_err(
        &mut self,
        tts: TokenStream,
@ -256,7 +254,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
    ) -> Vec<PErr<'psess>> {
        // If there are unclosed delims, see if there are diff markers and if so, point them
        // out instead of complaining about the unclosed delims.
-        let mut parser = Parser::new(self.string_reader.psess, tts, None);
+        let mut parser = Parser::new(self.psess, tts, None);
        let mut diff_errs = vec![];
        // Suggest removing a `{` we think appears in an `if`/`while` condition.
        // We want to suggest removing a `{` only if we think we're in an `if`/`while` condition,
@ -314,14 +312,9 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
        // An unexpected closing delimiter (i.e., there is no matching opening delimiter).
        let token_str = token_to_string(&self.token);
        let msg = format!("unexpected closing delimiter: `{token_str}`");
-        let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg);
+        let mut err = self.dcx().struct_span_err(self.token.span, msg);
-        report_suspicious_mismatch_block(
+        report_suspicious_mismatch_block(&mut err, &self.diag_info, self.psess.source_map(), delim);
            &mut err,
            &self.diag_info,
            self.string_reader.psess.source_map(),
            delim,
        );
        err.span_label(self.token.span, "unexpected closing delimiter");
        err
    }
--- a/compiler/rustc_parse/src/lexer/unicode_chars.rs
+++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@ -4,7 +4,7 @@
 use rustc_span::symbol::kw;
 use rustc_span::{BytePos, Pos, Span};
-use super::StringReader;
+use super::Lexer;
 use crate::errors::TokenSubstitution;
 use crate::token::{self, Delimiter};
@ -338,7 +338,7 @@ const ASCII_ARRAY: &[(&str, &str, Option<token::TokenKind>)] = &[
 ];
 pub(super) fn check_for_substitution(
-    reader: &StringReader<'_, '_>,
+    lexer: &Lexer<'_, '_>,
    pos: BytePos,
    ch: char,
    count: usize,
@ -351,11 +351,11 @@ pub(super) fn check_for_substitution(
    let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else {
        let msg = format!("substitution character not found for '{ch}'");
-        reader.dcx().span_bug(span, msg);
+        lexer.dcx().span_bug(span, msg);
    };
    // special help suggestion for "directed" double quotes
-    let sugg = if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') {
+    let sugg = if let Some(s) = peek_delimited(&lexer.src[lexer.src_index(pos)..], '“', '”') {
        let span = Span::with_root_ctxt(
            pos,
            pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()),