lexer: lex WS/COMMENT/SHEBANG rather than skipping

Now, the lexer will categorize every byte of its input according to the
grammar, emitting WS, COMMENT, and SHEBANG tokens rather than discarding
that trivia. The parser skips these tokens while parsing, so they never
appear in the input to syntax extensions.
Corey Richardson 2014-07-04 22:30:39 -07:00
parent cc4213418e
commit f512779554
6 changed files with 133 additions and 86 deletions
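
To make the filtering idea concrete, here is a self-contained sketch in
modern Rust; Token, Lexer, and the helper below are simplified stand-ins
for illustration, not the actual 2014 rustc types used in the diff:

// Self-contained sketch of the trivia-filtering idea in modern Rust.
// `Token` and `Lexer` are simplified stand-ins, not the rustc types.
#[derive(Clone, Debug, PartialEq)]
enum Token {
    Ws,            // run of whitespace
    Comment,       // any comment
    Shebang,       // a `#!...` line
    Ident(String), // a token the parser actually cares about
    Eof,
}

struct Lexer {
    tokens: Vec<Token>, // pretend the input is already lexed
    pos: usize,
}

impl Lexer {
    // Every byte of the input belongs to some token, trivia included.
    fn next_token(&mut self) -> Token {
        let t = self.tokens.get(self.pos).cloned().unwrap_or(Token::Eof);
        self.pos += 1;
        t
    }
}

// The parser-facing wrapper: loop past trivia so that downstream
// consumers (e.g. syntax extensions) never observe it.
fn real_token(lexer: &mut Lexer) -> Token {
    loop {
        match lexer.next_token() {
            Token::Ws | Token::Comment | Token::Shebang => continue,
            t => return t,
        }
    }
}

fn main() {
    let mut lexer = Lexer {
        tokens: vec![
            Token::Shebang,
            Token::Comment,
            Token::Ws,
            Token::Ident("fn".to_string()),
        ],
        pos: 0,
    };
    // The first "real" token skips straight past the shebang,
    // the comment, and the whitespace.
    assert_eq!(real_token(&mut lexer), Token::Ident("fn".to_string()));
}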

@@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool {
     is_plain_ident(t) || *t == token::UNDERSCORE
 }
 
+/// Get a token the parser cares about
+fn real_token(rdr: &mut Reader) -> TokenAndSpan {
+    let mut t = rdr.next_token();
+    loop {
+        match t.tok {
+            token::WS | token::COMMENT | token::SHEBANG(_) => {
+                t = rdr.next_token();
+            },
+            _ => break
+        }
+    }
+    t
+}
+
 impl<'a> Parser<'a> {
     pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig,
                mut rdr: Box<Reader>) -> Parser<'a> {
-        let tok0 = rdr.next_token();
+        let tok0 = real_token(rdr);
         let span = tok0.sp;
         let placeholder = TokenAndSpan {
             tok: token::UNDERSCORE,
@@ -864,7 +878,7 @@ impl<'a> Parser<'a> {
             None
         };
         let next = if self.buffer_start == self.buffer_end {
-            self.reader.next_token()
+            real_token(self.reader)
         } else {
             // Avoid token copies with `replace`.
             let buffer_start = self.buffer_start as uint;
@@ -908,7 +922,7 @@ impl<'a> Parser<'a> {
                       -> R {
         let dist = distance as int;
         while self.buffer_length() < dist {
-            self.buffer[self.buffer_end as uint] = self.reader.next_token();
+            self.buffer[self.buffer_end as uint] = real_token(self.reader);
             self.buffer_end = (self.buffer_end + 1) & 3;
         }
         f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok)
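
The `& 3` arithmetic in the last hunk works because the parser's
look-ahead buffer has four slots: since the capacity is a power of two,
masking an index with 3 is the same as reducing it modulo 4, so the
indices wrap around a small ring. A rough sketch of that scheme in
modern Rust (the names and the &'static str stand-in for tokens are
illustrative, not the actual Parser fields):

// Minimal sketch of a four-slot look-ahead ring, with masked counters.
struct LookAhead {
    buffer: [Option<&'static str>; 4], // stand-in for TokenAndSpan slots
    start: u32,                        // index of the oldest buffered token
    end: u32,                          // index one past the newest token
}

impl LookAhead {
    // Number of buffered tokens; `& 3` handles wrap-around because the
    // ring size (4) is a power of two.
    fn len(&self) -> u32 {
        self.end.wrapping_sub(self.start) & 3
    }

    fn push(&mut self, tok: &'static str) {
        self.buffer[(self.end & 3) as usize] = Some(tok);
        self.end = (self.end + 1) & 3;
    }

    // `dist` is 1-based, matching the diff's
    // `(self.buffer_start + dist - 1) & 3` indexing.
    fn peek(&self, dist: u32) -> Option<&'static str> {
        self.buffer[((self.start + dist - 1) & 3) as usize]
    }
}

fn main() {
    let mut la = LookAhead { buffer: [None; 4], start: 0, end: 0 };
    la.push("fn");
    la.push("main");
    assert_eq!(la.len(), 2);
    assert_eq!(la.peek(1), Some("fn"));
    assert_eq!(la.peek(2), Some("main"));
}

One consequence of relying on masked counters alone is that an empty
ring and a completely full one look identical, so a scheme like this
sketch can only distinguish up to three buffered tokens at a time.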