1
Fork 0

Rollup merge of #133070 - nnethercote:lexer-tweaks, r=chenyukang

Lexer tweaks

Some cleanups and small performance improvements.

r? `@chenyukang`
This commit is contained in:
Michael Goulet 2024-11-26 12:03:39 -05:00 committed by GitHub
commit 9d6a11a435
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 124 additions and 128 deletions

View file

@ -566,19 +566,19 @@ impl Cursor<'_> {
fn c_or_byte_string( fn c_or_byte_string(
&mut self, &mut self,
mk_kind: impl FnOnce(bool) -> LiteralKind, mk_kind: fn(bool) -> LiteralKind,
mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind, mk_kind_raw: fn(Option<u8>) -> LiteralKind,
single_quoted: Option<fn(bool) -> LiteralKind>, single_quoted: Option<fn(bool) -> LiteralKind>,
) -> TokenKind { ) -> TokenKind {
match (self.first(), self.second(), single_quoted) { match (self.first(), self.second(), single_quoted) {
('\'', _, Some(mk_kind)) => { ('\'', _, Some(single_quoted)) => {
self.bump(); self.bump();
let terminated = self.single_quoted_string(); let terminated = self.single_quoted_string();
let suffix_start = self.pos_within_token(); let suffix_start = self.pos_within_token();
if terminated { if terminated {
self.eat_literal_suffix(); self.eat_literal_suffix();
} }
let kind = mk_kind(terminated); let kind = single_quoted(terminated);
Literal { kind, suffix_start } Literal { kind, suffix_start }
} }
('"', _, _) => { ('"', _, _) => {

View file

@ -77,63 +77,53 @@ fn test_too_many_hashes() {
check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 })); check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
} }
// https://github.com/rust-lang/rust/issues/70528
#[test] #[test]
fn test_valid_shebang() { fn test_valid_shebang() {
// https://github.com/rust-lang/rust/issues/70528 let input = "#!/bin/bash";
let input = "#!/usr/bin/rustrun\nlet x = 5;"; assert_eq!(strip_shebang(input), Some(input.len()));
assert_eq!(strip_shebang(input), Some(18));
}
#[test] let input = "#![attribute]";
fn test_invalid_shebang_valid_rust_syntax() { assert_eq!(strip_shebang(input), None);
// https://github.com/rust-lang/rust/issues/70528
let input = "#! [bad_attribute]"; let input = "#! /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));
let input = "#! [attribute]";
assert_eq!(strip_shebang(input), None);
let input = "#! /* blah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));
let input = "#! /* blah */ [attribute]";
assert_eq!(strip_shebang(input), None);
let input = "#! // blah\n/bin/bash";
assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline
let input = "#! // blah\n[attribute]";
assert_eq!(strip_shebang(input), None);
let input = "#! /* blah\nblah\nblah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(10));
let input = "#! /* blah\nblah\nblah */ [attribute]";
assert_eq!(strip_shebang(input), None);
let input = "#!\n/bin/sh";
assert_eq!(strip_shebang(input), Some(2));
let input = "#!\n[attribute]";
assert_eq!(strip_shebang(input), None); assert_eq!(strip_shebang(input), None);
}
#[test]
fn test_shebang_second_line() {
// Because shebangs are interpreted by the kernel, they must be on the first line // Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash"; let input = "\n#!/bin/bash";
assert_eq!(strip_shebang(input), None); assert_eq!(strip_shebang(input), None);
}
#[test] let input = "\n#![attribute]";
fn test_shebang_space() {
let input = "#! /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));
}
#[test]
fn test_shebang_empty_shebang() {
let input = "#! \n[attribute(foo)]";
assert_eq!(strip_shebang(input), None); assert_eq!(strip_shebang(input), None);
} }
#[test]
fn test_invalid_shebang_comment() {
let input = "#!//bin/ami/a/comment\n[";
assert_eq!(strip_shebang(input), None)
}
#[test]
fn test_invalid_shebang_another_comment() {
let input = "#!/*bin/ami/a/comment*/\n[attribute";
assert_eq!(strip_shebang(input), None)
}
#[test]
fn test_shebang_valid_rust_after() {
let input = "#!/*bin/ami/a/comment*/\npub fn main() {}";
assert_eq!(strip_shebang(input), Some(23))
}
#[test]
fn test_shebang_followed_by_attrib() {
let input = "#!/bin/rust-scripts\n#![allow_unused(true)]";
assert_eq!(strip_shebang(input), Some(19));
}
fn check_lexing(src: &str, expect: Expect) { fn check_lexing(src: &str, expect: Expect) {
let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect(); let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
expect.assert_eq(&actual) expect.assert_eq(&actual)

View file

@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol;
use rustc_span::{BytePos, Pos, Span}; use rustc_span::{BytePos, Pos, Span};
use tracing::debug; use tracing::debug;
use crate::lexer::diagnostics::TokenTreeDiagInfo;
use crate::lexer::unicode_chars::UNICODE_ARRAY; use crate::lexer::unicode_chars::UNICODE_ARRAY;
use crate::{errors, make_unclosed_delims_error}; use crate::{errors, make_unclosed_delims_error};
@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
} }
let cursor = Cursor::new(src); let cursor = Cursor::new(src);
let string_reader = StringReader { let mut lexer = Lexer {
psess, psess,
start_pos, start_pos,
pos: start_pos, pos: start_pos,
@ -65,34 +66,31 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
override_span, override_span,
nbsp_is_whitespace: false, nbsp_is_whitespace: false,
last_lifetime: None, last_lifetime: None,
token: Token::dummy(),
diag_info: TokenTreeDiagInfo::default(),
}; };
let (stream, res, unmatched_delims) = let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false);
tokentrees::TokenTreesReader::lex_all_token_trees(string_reader); let unmatched_delims = lexer.diag_info.unmatched_delims;
match res {
Ok(()) if unmatched_delims.is_empty() => Ok(stream), if res.is_ok() && unmatched_delims.is_empty() {
_ => { Ok(stream)
} else {
// Return error if there are unmatched delimiters or unclosed delimiters. // Return error if there are unmatched delimiters or unclosed delimiters.
// We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
// because the delimiter mismatch is more likely to be the root cause of error // because the delimiter mismatch is more likely to be the root cause of error
let mut buffer: Vec<_> = unmatched_delims
let mut buffer = Vec::with_capacity(1); .into_iter()
for unmatched in unmatched_delims { .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
if let Some(err) = make_unclosed_delims_error(unmatched, psess) { .collect();
buffer.push(err);
}
}
if let Err(errs) = res { if let Err(errs) = res {
// Add unclosing delimiter or diff marker errors // Add unclosing delimiter or diff marker errors
for err in errs { buffer.extend(errs);
buffer.push(err);
}
} }
Err(buffer) Err(buffer)
} }
} }
}
struct StringReader<'psess, 'src> { struct Lexer<'psess, 'src> {
psess: &'psess ParseSess, psess: &'psess ParseSess,
/// Initial position, read-only. /// Initial position, read-only.
start_pos: BytePos, start_pos: BytePos,
@ -111,9 +109,14 @@ struct StringReader<'psess, 'src> {
/// Track the `Span` for the leading `'` of the last lifetime. Used for /// Track the `Span` for the leading `'` of the last lifetime. Used for
/// diagnostics to detect possible typo where `"` was meant. /// diagnostics to detect possible typo where `"` was meant.
last_lifetime: Option<Span>, last_lifetime: Option<Span>,
/// The current token.
token: Token,
diag_info: TokenTreeDiagInfo,
} }
impl<'psess, 'src> StringReader<'psess, 'src> { impl<'psess, 'src> Lexer<'psess, 'src> {
fn dcx(&self) -> DiagCtxtHandle<'psess> { fn dcx(&self) -> DiagCtxtHandle<'psess> {
self.psess.dcx() self.psess.dcx()
} }
@ -124,7 +127,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
/// Returns the next token, paired with a bool indicating if the token was /// Returns the next token, paired with a bool indicating if the token was
/// preceded by whitespace. /// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) { fn next_token_from_cursor(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false; let mut preceded_by_whitespace = false;
let mut swallow_next_invalid = 0; let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens // Skip trivial (whitespace & comments) tokens
@ -231,7 +234,8 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
.push(span); .push(span);
token::Ident(sym, IdentIsRaw::No) token::Ident(sym, IdentIsRaw::No)
} }
// split up (raw) c string literals to an ident and a string literal when edition < 2021. // split up (raw) c string literals to an ident and a string literal when edition <
// 2021.
rustc_lexer::TokenKind::Literal { rustc_lexer::TokenKind::Literal {
kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }), kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
suffix_start: _, suffix_start: _,
@ -252,7 +256,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start); let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace); return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
} }
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before), rustc_lexer::TokenKind::GuardedStrPrefix => {
self.maybe_report_guarded_str(start, str_before)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => { rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start); let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@ -296,13 +302,20 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
if prefix_span.at_least_rust_2021() { if prefix_span.at_least_rust_2021() {
let span = self.mk_sp(start, self.pos); let span = self.mk_sp(start, self.pos);
let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start)); let lifetime_name_without_tick =
Symbol::intern(&self.str_from(ident_start));
if !lifetime_name_without_tick.can_be_raw() { if !lifetime_name_without_tick.can_be_raw() {
self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick }); self.dcx().emit_err(
errors::CannotBeRawLifetime {
span,
ident: lifetime_name_without_tick
}
);
} }
// Put the `'` back onto the lifetime name. // Put the `'` back onto the lifetime name.
let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1); let mut lifetime_name =
String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
lifetime_name.push('\''); lifetime_name.push('\'');
lifetime_name += lifetime_name_without_tick.as_str(); lifetime_name += lifetime_name_without_tick.as_str();
let sym = Symbol::intern(&lifetime_name); let sym = Symbol::intern(&lifetime_name);

View file

@ -4,41 +4,19 @@ use rustc_ast_pretty::pprust::token_to_string;
use rustc_errors::{Applicability, PErr}; use rustc_errors::{Applicability, PErr};
use rustc_span::symbol::kw; use rustc_span::symbol::kw;
use super::diagnostics::{ use super::diagnostics::{report_suspicious_mismatch_block, same_indentation_level};
TokenTreeDiagInfo, report_suspicious_mismatch_block, same_indentation_level, use super::{Lexer, UnmatchedDelim};
};
use super::{StringReader, UnmatchedDelim};
use crate::Parser; use crate::Parser;
pub(super) struct TokenTreesReader<'psess, 'src> { impl<'psess, 'src> Lexer<'psess, 'src> {
string_reader: StringReader<'psess, 'src>,
/// The "next" token, which has been obtained from the `StringReader` but
/// not yet handled by the `TokenTreesReader`.
token: Token,
diag_info: TokenTreeDiagInfo,
}
impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
pub(super) fn lex_all_token_trees(
string_reader: StringReader<'psess, 'src>,
) -> (TokenStream, Result<(), Vec<PErr<'psess>>>, Vec<UnmatchedDelim>) {
let mut tt_reader = TokenTreesReader {
string_reader,
token: Token::dummy(),
diag_info: TokenTreeDiagInfo::default(),
};
let (_open_spacing, stream, res) = tt_reader.lex_token_trees(/* is_delimited */ false);
(stream, res, tt_reader.diag_info.unmatched_delims)
}
// Lex into a token stream. The `Spacing` in the result is that of the // Lex into a token stream. The `Spacing` in the result is that of the
// opening delimiter. // opening delimiter.
fn lex_token_trees( pub(super) fn lex_token_trees(
&mut self, &mut self,
is_delimited: bool, is_delimited: bool,
) -> (Spacing, TokenStream, Result<(), Vec<PErr<'psess>>>) { ) -> (Spacing, TokenStream, Result<(), Vec<PErr<'psess>>>) {
// Move past the opening delimiter. // Move past the opening delimiter.
let (_, open_spacing) = self.bump(false); let open_spacing = self.bump_minimal();
let mut buf = Vec::new(); let mut buf = Vec::new();
loop { loop {
@ -71,7 +49,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
} }
_ => { _ => {
// Get the next normal token. // Get the next normal token.
let (this_tok, this_spacing) = self.bump(true); let (this_tok, this_spacing) = self.bump();
buf.push(TokenTree::Token(this_tok, this_spacing)); buf.push(TokenTree::Token(this_tok, this_spacing));
} }
} }
@ -80,7 +58,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
fn eof_err(&mut self) -> PErr<'psess> { fn eof_err(&mut self) -> PErr<'psess> {
let msg = "this file contains an unclosed delimiter"; let msg = "this file contains an unclosed delimiter";
let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg); let mut err = self.dcx().struct_span_err(self.token.span, msg);
let unclosed_delimiter_show_limit = 5; let unclosed_delimiter_show_limit = 5;
let len = usize::min(unclosed_delimiter_show_limit, self.diag_info.open_braces.len()); let len = usize::min(unclosed_delimiter_show_limit, self.diag_info.open_braces.len());
@ -110,7 +88,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
report_suspicious_mismatch_block( report_suspicious_mismatch_block(
&mut err, &mut err,
&self.diag_info, &self.diag_info,
self.string_reader.psess.source_map(), self.psess.source_map(),
*delim, *delim,
) )
} }
@ -136,7 +114,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
// Expand to cover the entire delimited token tree. // Expand to cover the entire delimited token tree.
let delim_span = DelimSpan::from_pair(pre_span, self.token.span); let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
let sm = self.string_reader.psess.source_map(); let sm = self.psess.source_map();
let close_spacing = match self.token.kind { let close_spacing = match self.token.kind {
// Correct delimiter. // Correct delimiter.
@ -160,7 +138,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
} }
// Move past the closing delimiter. // Move past the closing delimiter.
self.bump(false).1 self.bump_minimal()
} }
// Incorrect delimiter. // Incorrect delimiter.
token::CloseDelim(close_delim) => { token::CloseDelim(close_delim) => {
@ -203,7 +181,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
// bar(baz( // bar(baz(
// } // Incorrect delimiter but matches the earlier `{` // } // Incorrect delimiter but matches the earlier `{`
if !self.diag_info.open_braces.iter().any(|&(b, _)| b == close_delim) { if !self.diag_info.open_braces.iter().any(|&(b, _)| b == close_delim) {
self.bump(false).1 self.bump_minimal()
} else { } else {
// The choice of value here doesn't matter. // The choice of value here doesn't matter.
Spacing::Alone Spacing::Alone
@ -225,14 +203,14 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
} }
// Move on to the next token, returning the current token and its spacing. // Move on to the next token, returning the current token and its spacing.
// Will glue adjacent single-char tokens together if `glue` is set. // Will glue adjacent single-char tokens together.
fn bump(&mut self, glue: bool) -> (Token, Spacing) { fn bump(&mut self) -> (Token, Spacing) {
let (this_spacing, next_tok) = loop { let (this_spacing, next_tok) = loop {
let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor();
if is_next_tok_preceded_by_whitespace { if is_next_tok_preceded_by_whitespace {
break (Spacing::Alone, next_tok); break (Spacing::Alone, next_tok);
} else if glue && let Some(glued) = self.token.glue(&next_tok) { } else if let Some(glued) = self.token.glue(&next_tok) {
self.token = glued; self.token = glued;
} else { } else {
let this_spacing = if next_tok.is_punct() { let this_spacing = if next_tok.is_punct() {
@ -249,6 +227,26 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
(this_tok, this_spacing) (this_tok, this_spacing)
} }
// Cut-down version of `bump` used when the token kind is known in advance.
// It performs no token gluing: it lexes one token from the cursor, stores it
// in `self.token`, and returns only the spacing derived from that token.
fn bump_minimal(&mut self) -> Spacing {
    let (upcoming, after_whitespace) = self.next_token_from_cursor();

    // Whitespace before the upcoming token always yields `Alone`. Otherwise
    // the result depends on whether the upcoming token is punctuation, the
    // end of input, or anything else (checked in that order).
    let spacing = if after_whitespace {
        Spacing::Alone
    } else if upcoming.is_punct() {
        Spacing::Joint
    } else if upcoming == token::Eof {
        Spacing::Alone
    } else {
        Spacing::JointHidden
    };

    self.token = upcoming;
    spacing
}
fn unclosed_delim_err( fn unclosed_delim_err(
&mut self, &mut self,
tts: TokenStream, tts: TokenStream,
@ -256,7 +254,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
) -> Vec<PErr<'psess>> { ) -> Vec<PErr<'psess>> {
// If there are unclosed delims, see if there are diff markers and if so, point them // If there are unclosed delims, see if there are diff markers and if so, point them
// out instead of complaining about the unclosed delims. // out instead of complaining about the unclosed delims.
let mut parser = Parser::new(self.string_reader.psess, tts, None); let mut parser = Parser::new(self.psess, tts, None);
let mut diff_errs = vec![]; let mut diff_errs = vec![];
// Suggest removing a `{` we think appears in an `if`/`while` condition. // Suggest removing a `{` we think appears in an `if`/`while` condition.
// We want to suggest removing a `{` only if we think we're in an `if`/`while` condition, // We want to suggest removing a `{` only if we think we're in an `if`/`while` condition,
@ -314,14 +312,9 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
// An unexpected closing delimiter (i.e., there is no matching opening delimiter). // An unexpected closing delimiter (i.e., there is no matching opening delimiter).
let token_str = token_to_string(&self.token); let token_str = token_to_string(&self.token);
let msg = format!("unexpected closing delimiter: `{token_str}`"); let msg = format!("unexpected closing delimiter: `{token_str}`");
let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg); let mut err = self.dcx().struct_span_err(self.token.span, msg);
report_suspicious_mismatch_block( report_suspicious_mismatch_block(&mut err, &self.diag_info, self.psess.source_map(), delim);
&mut err,
&self.diag_info,
self.string_reader.psess.source_map(),
delim,
);
err.span_label(self.token.span, "unexpected closing delimiter"); err.span_label(self.token.span, "unexpected closing delimiter");
err err
} }

View file

@ -4,7 +4,7 @@
use rustc_span::symbol::kw; use rustc_span::symbol::kw;
use rustc_span::{BytePos, Pos, Span}; use rustc_span::{BytePos, Pos, Span};
use super::StringReader; use super::Lexer;
use crate::errors::TokenSubstitution; use crate::errors::TokenSubstitution;
use crate::token::{self, Delimiter}; use crate::token::{self, Delimiter};
@ -338,7 +338,7 @@ const ASCII_ARRAY: &[(&str, &str, Option<token::TokenKind>)] = &[
]; ];
pub(super) fn check_for_substitution( pub(super) fn check_for_substitution(
reader: &StringReader<'_, '_>, lexer: &Lexer<'_, '_>,
pos: BytePos, pos: BytePos,
ch: char, ch: char,
count: usize, count: usize,
@ -351,11 +351,11 @@ pub(super) fn check_for_substitution(
let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else { let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else {
let msg = format!("substitution character not found for '{ch}'"); let msg = format!("substitution character not found for '{ch}'");
reader.dcx().span_bug(span, msg); lexer.dcx().span_bug(span, msg);
}; };
// special help suggestion for "directed" double quotes // special help suggestion for "directed" double quotes
let sugg = if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') { let sugg = if let Some(s) = peek_delimited(&lexer.src[lexer.src_index(pos)..], '“', '”') {
let span = Span::with_root_ctxt( let span = Span::with_root_ctxt(
pos, pos,
pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()), pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()),