auto merge of #5559 : jbclements/rust/change-to-tt-based-parsing, r=jbclements

Changes the parser to parse all streams into token-trees before hitting the parser proper, in preparation for hygiene. As an added bonus, it appears to speed up the parser (albeit by a totally imperceptible 1%). Also adds many comments to the parser and renames fields in the token-tree reader (readme->forest, cur->stack).
2013-04-03 11:31:03 -07:00 · 2013-04-03 11:31:03 -07:00 · 6153aae809
commit 6153aae809
parent 260d74dfcc f2e47cddf8
6 changed files with 124 additions and 35 deletions
--- a/src/librustc/driver/driver.rs
+++ b/src/librustc/driver/driver.rs
@ -151,7 +151,7 @@ pub fn parse_input(sess: Session, +cfg: ast::crate_cfg, input: input)
    -> @ast::crate {
    match input {
      file_input(ref file) => {
-        parse::parse_crate_from_file(&(*file), cfg, sess.parse_sess)
+        parse::parse_crate_from_file_using_tts(&(*file), cfg, sess.parse_sess)
      }
      str_input(ref src) => {
        // FIXME (#2319): Don't really want to box the source string
--- a/src/libsyntax/ext/tt/transcribe.rs
+++ b/src/libsyntax/ext/tt/transcribe.rs
@ -26,7 +26,7 @@ use core::vec;
   `~` */
 ///an unzipping of `token_tree`s
 struct TtFrame {
-    readme: @mut ~[ast::token_tree],
+    forest: @mut ~[ast::token_tree],
    idx: uint,
    dotdotdoted: bool,
    sep: Option<Token>,
@ -37,7 +37,7 @@ pub struct TtReader {
    sp_diag: @span_handler,
    interner: @ident_interner,
    // the unzipped tree:
-    cur: @mut TtFrame,
+    stack: @mut TtFrame,
    /* for MBE-style macro transcription */
    interpolations: LinearMap<ident, @named_match>,
    repeat_idx: ~[uint],
@ -58,8 +58,8 @@ pub fn new_tt_reader(sp_diag: @span_handler,
    let r = @mut TtReader {
        sp_diag: sp_diag,
        interner: itr,
-        cur: @mut TtFrame {
-            readme: @mut src,
+        stack: @mut TtFrame {
+            forest: @mut src,
            idx: 0u,
            dotdotdoted: false,
            sep: None,
@ -81,7 +81,7 @@ pub fn new_tt_reader(sp_diag: @span_handler,

 fn dup_tt_frame(f: @mut TtFrame) -> @mut TtFrame {
    @mut TtFrame {
-        readme: @mut (copy *f.readme),
+        forest: @mut (copy *f.forest),
        idx: f.idx,
        dotdotdoted: f.dotdotdoted,
        sep: copy f.sep,
@ -96,7 +96,7 @@ pub fn dup_tt_reader(r: @mut TtReader) -> @mut TtReader {
    @mut TtReader {
        sp_diag: r.sp_diag,
        interner: r.interner,
-        cur: dup_tt_frame(r.cur),
+        stack: dup_tt_frame(r.stack),
        interpolations: r.interpolations,
        repeat_idx: copy r.repeat_idx,
        repeat_len: copy r.repeat_len,
@ -167,7 +167,8 @@ fn lockstep_iter_size(t: token_tree, r: &mut TtReader) -> lis {
    }
 }

-
+// return the next token from the TtReader.
+// EFFECT: advances the reader's token field
 pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
    let ret_val = TokenAndSpan {
        tok: copy r.cur_tok,
@ -175,37 +176,37 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
    };
    loop {
        {
-            let cur = &mut *r.cur;
-            let readme = &mut *cur.readme;
-            if cur.idx < readme.len() {
+            let stack = &mut *r.stack;
+            let forest = &mut *stack.forest;
+            if stack.idx < forest.len() {
                break;
            }
        }

        /* done with this set; pop or repeat? */
-        if ! r.cur.dotdotdoted
+        if ! r.stack.dotdotdoted
            || { *r.repeat_idx.last() == *r.repeat_len.last() - 1 } {

-            match r.cur.up {
+            match r.stack.up {
              None => {
                r.cur_tok = EOF;
                return ret_val;
              }
              Some(tt_f) => {
-                if r.cur.dotdotdoted {
+                if r.stack.dotdotdoted {
                    r.repeat_idx.pop();
                    r.repeat_len.pop();
                }

-                r.cur = tt_f;
-                r.cur.idx += 1u;
+                r.stack = tt_f;
+                r.stack.idx += 1u;
              }
            }

        } else { /* repeat */
-            r.cur.idx = 0u;
+            r.stack.idx = 0u;
            r.repeat_idx[r.repeat_idx.len() - 1u] += 1u;
-            match r.cur.sep {
+            match r.stack.sep {
              Some(copy tk) => {
                r.cur_tok = tk; /* repeat same span, I guess */
                return ret_val;
@ -216,21 +217,21 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
    }
    loop { /* because it's easiest, this handles `tt_delim` not starting
    with a `tt_tok`, even though it won't happen */
-        match r.cur.readme[r.cur.idx] {
+        match r.stack.forest[r.stack.idx] {
          tt_delim(copy tts) => {
-            r.cur = @mut TtFrame {
-                readme: @mut tts,
+            r.stack = @mut TtFrame {
+                forest: @mut tts,
                idx: 0u,
                dotdotdoted: false,
                sep: None,
-                up: option::Some(r.cur)
+                up: option::Some(r.stack)
            };
            // if this could be 0-length, we'd need to potentially recur here
          }
          tt_tok(sp, copy tok) => {
            r.cur_span = sp;
            r.cur_tok = tok;
-            r.cur.idx += 1u;
+            r.stack.idx += 1u;
            return ret_val;
          }
          tt_seq(sp, copy tts, copy sep, zerok) => {
@ -256,17 +257,17 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
                                               once");
                          }

-                    r.cur.idx += 1u;
+                    r.stack.idx += 1u;
                    return tt_next_token(r);
                } else {
                    r.repeat_len.push(len);
                    r.repeat_idx.push(0u);
-                    r.cur = @mut TtFrame {
-                        readme: @mut tts,
+                    r.stack = @mut TtFrame {
+                        forest: @mut tts,
                        idx: 0u,
                        dotdotdoted: true,
                        sep: sep,
-                        up: Some(r.cur)
+                        up: Some(r.stack)
                    };
                }
              }
@ -280,13 +281,13 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
              (b) we actually can, since it's a token. */
              matched_nonterminal(nt_ident(sn,b)) => {
                r.cur_span = sp; r.cur_tok = IDENT(sn,b);
-                r.cur.idx += 1u;
+                r.stack.idx += 1u;
                return ret_val;
              }
              matched_nonterminal(ref other_whole_nt) => {
                r.cur_span = sp;
                r.cur_tok = INTERPOLATED(copy *other_whole_nt);
-                r.cur.idx += 1u;
+                r.stack.idx += 1u;
                return ret_val;
              }
              matched_seq(*) => {
--- a/src/libsyntax/parse/common.rs
+++ b/src/libsyntax/parse/common.rs
@ -159,6 +159,9 @@ pub impl Parser {
        }
    }

+    // if the given word is not a keyword, signal an error.
+    // if the next token is the given keyword, eat it and return
+    // true. Otherwise, return false.
    fn eat_keyword(&self, word: &~str) -> bool {
        self.require_keyword(word);
        let is_kw = match *self.token {
@ -169,6 +172,9 @@ pub impl Parser {
        is_kw
    }

+    // if the given word is not a keyword, signal an error.
+    // if the next token is not the given word, signal an error.
+    // otherwise, eat it.
    fn expect_keyword(&self, word: &~str) {
        self.require_keyword(word);
        if !self.eat_keyword(word) {
@ -182,10 +188,12 @@ pub impl Parser {
        }
    }

+    // return true if the given string is a strict keyword
    fn is_strict_keyword(&self, word: &~str) -> bool {
        self.strict_keywords.contains(word)
    }

+    // signal an error if the current token is a strict keyword
    fn check_strict_keywords(&self) {
        match *self.token {
            token::IDENT(_, false) => {
@ -196,16 +204,19 @@ pub impl Parser {
        }
    }

+    // signal an error if the given string is a strict keyword
    fn check_strict_keywords_(&self, w: &~str) {
        if self.is_strict_keyword(w) {
            self.fatal(fmt!("found `%s` in ident position", *w));
        }
    }

+    // return true if this is a reserved keyword
    fn is_reserved_keyword(&self, word: &~str) -> bool {
        self.reserved_keywords.contains(word)
    }

+    // signal an error if the current token is a reserved keyword
    fn check_reserved_keywords(&self) {
        match *self.token {
            token::IDENT(_, false) => {
@ -216,6 +227,7 @@ pub impl Parser {
        }
    }

+    // signal an error if the given string is a reserved keyword
    fn check_reserved_keywords_(&self, w: &~str) {
        if self.is_reserved_keyword(w) {
            self.fatal(fmt!("`%s` is a reserved keyword", *w));
@ -223,7 +235,8 @@ pub impl Parser {
    }

    // expect and consume a GT. if a >> is seen, replace it
-    // with a single > and continue.
+    // with a single > and continue. If a GT is not seen,
+    // signal an error.
    fn expect_gt(&self) {
        if *self.token == token::GT {
            self.bump();
--- a/src/libsyntax/parse/lexer.rs
+++ b/src/libsyntax/parse/lexer.rs
@ -80,7 +80,8 @@ pub fn new_low_level_string_reader(span_diagnostic: @span_handler,
        last_pos: filemap.start_pos,
        col: CharPos(0),
        curr: initial_char,
-        filemap: filemap, interner: itr,
+        filemap: filemap,
+        interner: itr,
        /* dummy values; not read */
        peek_tok: token::EOF,
        peek_span: codemap::dummy_sp()
@ -150,6 +151,7 @@ impl reader for TtReader {
 }

 // EFFECT: advance peek_tok and peek_span to refer to the next token.
+// EFFECT: update the interner, maybe.
 fn string_advance_token(r: @mut StringReader) {
    match (consume_whitespace_and_comments(r)) {
        Some(comment) => {
@ -539,6 +541,9 @@ fn ident_continue(c: char) -> bool {
        || (c > 'z' && char::is_XID_continue(c))
 }

+// return the next token from the string
+// EFFECT: advances the input past that token
+// EFFECT: updates the interner
 fn next_token_inner(rdr: @mut StringReader) -> token::Token {
    let mut accum_str = ~"";
    let mut c = rdr.curr;
--- a/src/libsyntax/parse/mod.rs
+++ b/src/libsyntax/parse/mod.rs
@ -45,10 +45,14 @@ pub mod classify;
 /// Reporting obsolete syntax
 pub mod obsolete;

+// info about a parsing session.
+// This structure and the reader both have
+// an interner associated with them. If they're
+// not the same, bad things can happen.
 pub struct ParseSess {
-    cm: @codemap::CodeMap,
+    cm: @codemap::CodeMap, // better be the same as the one in the reader!
    next_id: node_id,
-    span_diagnostic: @span_handler,
+    span_diagnostic: @span_handler, // better be the same as the one in the reader!
    interner: @ident_interner,
 }

@ -90,6 +94,19 @@ pub fn parse_crate_from_file(
    // why is there no p.abort_if_errors here?
 }

+pub fn parse_crate_from_file_using_tts(
+    input: &Path,
+    cfg: ast::crate_cfg,
+    sess: @mut ParseSess
+) -> @ast::crate {
+    let p = new_parser_from_file(sess, /*bad*/ copy cfg, input);
+    let tts = p.parse_all_token_trees();
+    new_parser_from_tts(sess,cfg,tts).parse_crate_mod(/*bad*/ copy cfg)
+    // why is there no p.abort_if_errors here?
+}
+
+
+
 pub fn parse_crate_from_source_str(
    name: ~str,
    source: @~str,
@ -313,6 +330,7 @@ mod test {
    use std;
    use core::io;
    use core::option::None;
+    use ast;

    #[test] fn to_json_str<E : Encodable<std::json::Encoder>>(val: @E) -> ~str {
        do io::with_str_writer |writer| {
@ -320,10 +338,38 @@ mod test {
        }
    }

-    #[test] fn alltts () {
+    fn string_to_crate (source_str : @~str) -> @ast::crate {
+        parse_crate_from_source_str(
+            ~"bogofile",
+            source_str,
+            ~[],
+            new_parse_sess(None))
+    }
+
+    fn string_to_tt_to_crate (source_str : @~str) -> @ast::crate {
        let tts = parse_tts_from_source_str(
            ~"bogofile",
-            @~"fn foo (x : int) { x; }",
+           source_str,
+           ~[],
+           new_parse_sess(None));
+        new_parser_from_tts(new_parse_sess(None),~[],tts)
+            .parse_crate_mod(~[])
+    }
+
+    // make sure that parsing from TTs produces the same result
+    // as parsing from strings
+    #[test] fn tts_produce_the_same_result () {
+        let source_str = @~"fn foo (x : int) { x; }";
+        assert_eq!(string_to_tt_to_crate(source_str),
+                     string_to_crate(source_str));
+    }
+
+    // check the contents of the tt manually:
+    #[test] fn alltts () {
+        let source_str = @~"fn foo (x : int) { x; }";
+        let tts = parse_tts_from_source_str(
+            ~"bogofile",
+            source_str,
            ~[],
            new_parse_sess(None));
        assert_eq!(
--- a/src/libsyntax/parse/parser.rs
+++ b/src/libsyntax/parse/parser.rs
@ -248,6 +248,7 @@ pub fn Parser(sess: @mut ParseSess,
    }
 }

+// ooh, nasty mutable fields everywhere....
 pub struct Parser {
    sess: @mut ParseSess,
    cfg: crate_cfg,
@ -340,6 +341,7 @@ pub impl Parser {
        self.sess.interner.get(id)
    }

+    // is this one of the keywords that signals a closure type?
    fn token_is_closure_keyword(&self, tok: &token::Token) -> bool {
        self.token_is_keyword(&~"pure", tok) ||
            self.token_is_keyword(&~"unsafe", tok) ||
@ -347,6 +349,7 @@ pub impl Parser {
            self.token_is_keyword(&~"fn", tok)
    }

+    // parse a ty_bare_fn type:
    fn parse_ty_bare_fn(&self) -> ty_
    {
        /*
@ -376,6 +379,7 @@ pub impl Parser {
        });
    }

+    // parse a ty_closure type
    fn parse_ty_closure(&self,
                        sigil: ast::Sigil,
                        region: Option<@ast::Lifetime>) -> ty_
@ -434,6 +438,7 @@ pub impl Parser {
        }
    }

+    // parse a function type (following the 'fn')
    fn parse_ty_fn_decl(&self) -> (fn_decl, OptVec<ast::Lifetime>) {
        /*

@ -545,12 +550,14 @@ pub impl Parser {
    }


+    // parse a possibly mutable type
    fn parse_mt(&self) -> mt {
        let mutbl = self.parse_mutability();
        let t = self.parse_ty(false);
        mt { ty: t, mutbl: mutbl }
    }

+    // parse [mut/const/imm] ID : TY
    fn parse_ty_field(&self) -> ty_field {
        let lo = self.span.lo;
        let mutbl = self.parse_mutability();
@ -567,6 +574,7 @@ pub impl Parser {
        )
    }

+    // parse optional return type [ -> TY ] in function decl
    fn parse_ret_ty(&self) -> (ret_style, @Ty) {
        return if self.eat(&token::RARROW) {
            let lo = self.span.lo;
@ -595,6 +603,7 @@ pub impl Parser {
        }
    }

+    // parse a type.
    // Useless second parameter for compatibility with quasiquote macros.
    // Bleh!
    fn parse_ty(&self, _: bool) -> @Ty {
@ -631,15 +640,19 @@ pub impl Parser {
                t
            }
        } else if *self.token == token::AT {
+            // MANAGED POINTER
            self.bump();
            self.parse_box_or_uniq_pointee(ManagedSigil, ty_box)
        } else if *self.token == token::TILDE {
+            // OWNED POINTER
            self.bump();
            self.parse_box_or_uniq_pointee(OwnedSigil, ty_uniq)
        } else if *self.token == token::BINOP(token::STAR) {
+            // STAR POINTER (bare pointer?)
            self.bump();
            ty_ptr(self.parse_mt())
        } else if *self.token == token::LBRACE {
+            // STRUCTURAL RECORD (remove?)
            let elems = self.parse_unspanned_seq(
                &token::LBRACE,
                &token::RBRACE,
@ -652,6 +665,7 @@ pub impl Parser {
            self.obsolete(*self.last_span, ObsoleteRecordType);
            ty_nil
        } else if *self.token == token::LBRACKET {
+            // VECTOR
            self.expect(&token::LBRACKET);
            let mt = self.parse_mt();
            if mt.mutbl == m_mutbl {    // `m_const` too after snapshot
@ -667,16 +681,20 @@ pub impl Parser {
            self.expect(&token::RBRACKET);
            t
        } else if *self.token == token::BINOP(token::AND) {
+            // BORROWED POINTER
            self.bump();
            self.parse_borrowed_pointee()
        } else if self.eat_keyword(&~"extern") {
+            // EXTERN FUNCTION
            self.parse_ty_bare_fn()
        } else if self.token_is_closure_keyword(&copy *self.token) {
+            // CLOSURE
            let result = self.parse_ty_closure(ast::BorrowedSigil, None);
            self.obsolete(*self.last_span, ObsoleteBareFnType);
            result
        } else if *self.token == token::MOD_SEP
            || is_ident_or_path(&*self.token) {
+            // NAMED TYPE
            let path = self.parse_path_with_tps(false);
            ty_path(path, self.get_id())
        } else {
@ -885,6 +903,8 @@ pub impl Parser {
        let global = self.eat(&token::MOD_SEP);
        let mut ids = ~[];
        loop {
+            // if there's a ::< coming, stop processing
+            // the path.
            let is_not_last =
                self.look_ahead(2u) != token::LT
                && self.look_ahead(1u) == token::MOD_SEP;
@ -904,6 +924,9 @@ pub impl Parser {
                     types: ~[] }
    }

+    // parse a path optionally with type parameters. If 'colons'
+    // is true, then type parameters must be preceded by colons,
+    // as in a::t::<t1,t2>
    fn parse_path_with_tps(&self, colons: bool) -> @ast::path {
        debug!("parse_path_with_tps(colons=%b)", colons);

@ -1071,6 +1094,7 @@ pub impl Parser {
        self.token_is_keyword(&~"const", tok)
    }

+    // parse mutability declaration (mut/const/imm)
    fn parse_mutability(&self) -> mutability {
        if self.eat_keyword(&~"mut") {
            m_mutbl