Implement 2 kinds of char / str escaping. Use in rustc. Close #2306.

2012-05-31 15:31:13 -07:00 · 2012-05-31 15:31:13 -07:00 · 57f399bd63
commit 57f399bd63
parent c2ce2741a7
5 changed files with 136 additions and 33 deletions
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@ -38,7 +38,8 @@ export is_alphabetic,
       is_lowercase, is_uppercase,
       is_whitespace, is_alphanumeric,
       is_ascii, is_digit,
-       to_digit, cmp;
+       to_digit, cmp,
+       escape_default, escape_unicode;

 import is_alphabetic = unicode::derived_property::Alphabetic;
 import is_XID_start = unicode::derived_property::XID_Start;
@ -122,6 +123,53 @@ pure fn to_digit(c: char, radix: uint) -> option<uint> {
    else { none }
 }

+#[doc = "
+Return the hexadecimal unicode escape of a char.
+
+The rules are as follows:
+
+  - chars in [0,0xff] get 2-digit escapes: `\\xNN`
+  - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
+  - chars above 0xffff get 8-digit escapes: `\\UNNNNNNNN`
+"]
+fn escape_unicode(c: char) -> str {
+    let s = u32::to_str(c as u32, 16u); // hex digits, not yet padded
+    let (c, pad) = (if c <= '\xff' { ('x', 2u) } // c: now escape letter
+                    else if c <= '\uffff' { ('u', 4u) }
+                    else { ('U', 8u) });
+    assert str::len(s) <= pad; // digits must fit the chosen width
+    let mut out = "\\";
+    out += str::from_char(c);
+    for uint::range(str::len(s), pad) {|_i| out += "0"; } // left-pad
+    out += s;
+    ret out;
+}
+
+#[doc = "
+Return a 'default' ASCII and C++11-like char-literal escape of a char.
+
+The default is chosen with a bias toward producing literals that are
+legal in a variety of languages, including C++11 and similar C-family
+languages. The exact rules are:
+
+  - Tab, CR and LF are escaped as '\\t', '\\r' and '\\n' respectively.
+  - Single-quote, double-quote and backslash chars are backslash-escaped.
+  - Any other chars in the range [0x20,0x7e] are not escaped.
+  - Any other chars are given hex unicode escapes; see `escape_unicode`.
+"]
+fn escape_default(c: char) -> str {
+    alt c {
+      '\t' { "\\t" }
+      '\r' { "\\r" }
+      '\n' { "\\n" }
+      '\\' { "\\\\" } // \, ' and " lie in 0x20-0x7e: listed first
+      '\'' { "\\'" }
+      '"' { "\\\"" }
+      '\x20' to '\x7e' { str::from_char(c) } // other printable ASCII
+      _ { escape_unicode(c) } // the rest: \x, \u or \U hex form
+    }
+}
+
 #[doc = "
 Compare two chars

@ -198,3 +246,30 @@ fn test_is_digit() {
   assert ! is_digit('Q');
 }

+#[test]
+fn test_escape_default() {
+    assert escape_default('\n') == "\\n";
+    assert escape_default('\r') == "\\r";
+    assert escape_default('\'') == "\\'";
+    assert escape_default('"') == "\\\"";
+    assert escape_default(' ') == " "; // 0x20: first unescaped char
+    assert escape_default('a') == "a";
+    assert escape_default('~') == "~"; // 0x7e: last unescaped char
+    assert escape_default('\x00') == "\\x00";
+    assert escape_default('\x1f') == "\\x1f"; // just below 0x20
+    assert escape_default('\x7f') == "\\x7f"; // just above 0x7e
+    assert escape_default('\xff') == "\\xff"; // last \x code point
+    assert escape_default('\u011b') == "\\u011b";
+    assert escape_default('\U0001d4b6') == "\\U0001d4b6";
+}
+
+
+#[test]
+fn test_escape_unicode() {
+    assert escape_unicode('\x00') == "\\x00";
+    assert escape_unicode('\n') == "\\x0a"; // no short \n form here
+    assert escape_unicode(' ') == "\\x20";
+    assert escape_unicode('a') == "\\x61"; // printable ASCII escaped too
+    assert escape_unicode('\u011b') == "\\u011b";
+    assert escape_unicode('\U0001d4b6') == "\\U0001d4b6";
+}
--- a/src/libcore/str.rs
+++ b/src/libcore/str.rs
@ -97,6 +97,8 @@ export
   reserve,
   reserve_at_least,
   capacity,
+   escape_default,
+   escape_unicode,

   unsafe,
   extensions;
@ -1625,6 +1627,22 @@ fn capacity(&&s: str) -> uint unsafe {
    }
 }

+#[doc = "Escape each char in `s` with char::escape_default."]
+fn escape_default(s: str) -> str {
+    let mut out: str = "";
+    reserve_at_least(out, str::len(s)); // pre-size to the input length
+    chars_iter(s) {|c| out += char::escape_default(c); } // char by char
+    ret out;
+}
+
+#[doc = "Escape each char in `s` with char::escape_unicode."]
+fn escape_unicode(s: str) -> str {
+    let mut out: str = "";
+    reserve_at_least(out, str::len(s)); // pre-size to the input length
+    chars_iter(s) {|c| out += char::escape_unicode(c); } // char by char
+    ret out;
+}
+
 #[doc = "Unsafe operations"]
 mod unsafe {
   export
@ -1866,6 +1884,12 @@ impl extensions for str {
    #[doc = "Returns a string with trailing whitespace removed"]
    #[inline]
    fn trim_right() -> str { trim_right(self) }
+    #[doc = "Escape each char in `self` with char::escape_default."]
+    #[inline]
+    fn escape_default() -> str { escape_default(self) } // wraps free fn
+    #[doc = "Escape each char in `self` with char::escape_unicode."]
+    #[inline]
+    fn escape_unicode() -> str { escape_unicode(self) } // wraps free fn
 }

 #[cfg(test)]
@ -2748,4 +2772,32 @@ mod tests {
            assert *ptr::offset(buf,5u) == 0u8;
        }
    }
+
+    #[test]
+    fn test_escape_unicode() {
+        assert escape_unicode("abc") == "\\x61\\x62\\x63"; // even ASCII
+        assert escape_unicode("a c") == "\\x61\\x20\\x63";
+        assert escape_unicode("\r\n\t") == "\\x0d\\x0a\\x09";
+        assert escape_unicode("'\"\\") == "\\x27\\x22\\x5c";
+        assert escape_unicode("\x00\x01\xfe\xff") == "\\x00\\x01\\xfe\\xff";
+        assert escape_unicode("\u0100\uffff") == "\\u0100\\uffff"; // \u bounds
+        assert escape_unicode("\U00010000\U0010ffff") ==
+            "\\U00010000\\U0010ffff"; // 0x10000 is the first \U char
+        assert escape_unicode("ab\ufb00") == "\\x61\\x62\\ufb00";
+        assert escape_unicode("\U0001d4ea\r") == "\\U0001d4ea\\x0d";
+    }
+
+    #[test]
+    fn test_escape_default() {
+        assert escape_default("abc") == "abc"; // printable ASCII kept
+        assert escape_default("a c") == "a c";
+        assert escape_default("\r\n\t") == "\\r\\n\\t";
+        assert escape_default("'\"\\") == "\\'\\\"\\\\"; // quotes, backslash
+        assert escape_default("\u0100\uffff") == "\\u0100\\uffff";
+        assert escape_default("\U00010000\U0010ffff") ==
+            "\\U00010000\\U0010ffff";
+        assert escape_default("ab\ufb00") == "ab\\ufb00"; // only \ufb00 escaped
+        assert escape_default("\U0001d4ea\r") == "\\U0001d4ea\\r";
+    }
+
 }
--- a/src/libsyntax/parse/lexer.rs
+++ b/src/libsyntax/parse/lexer.rs
@ -434,6 +434,7 @@ fn next_token_inner(rdr: reader) -> token::token {
              't' { c2 = '\t'; }
              '\\' { c2 = '\\'; }
              '\'' { c2 = '\''; }
+              '"' { c2 = '"'; }
              'x' { c2 = scan_numeric_escape(rdr, 2u); }
              'u' { c2 = scan_numeric_escape(rdr, 4u); }
              'U' { c2 = scan_numeric_escape(rdr, 8u); }
--- a/src/libsyntax/parse/token.rs
+++ b/src/libsyntax/parse/token.rs
@ -122,11 +122,7 @@ fn to_str(in: interner<str>, t: token) -> str {

      /* Literals */
      LIT_INT(c, ast::ty_char) {
-        // FIXME: escape.
-        let mut tmp = "'";
-        str::push_char(tmp, c as char);
-        str::push_char(tmp, '\'');
-        ret tmp;
+        ret "'" + char::escape_default(c as char) + "'";
      }
      LIT_INT(i, t) {
        ret int::to_str(i as int, 10u) + ast_util::int_ty_to_str(t);
@ -138,10 +134,11 @@ fn to_str(in: interner<str>, t: token) -> str {
        ret interner::get::<str>(in, s) +
            ast_util::float_ty_to_str(t);
      }
-      LIT_STR(s) { // FIXME: escape.
-        ret "\"" + interner::get::<str>(in, s) + "\"";
+      LIT_STR(s) {
+        ret "\""
+            + str::escape_default(interner::get::<str>(in, s))
+            + "\"";
      }
-
      /* Name components */
      IDENT(s, _) {
        ret interner::get::<str>(in, s);
--- a/src/libsyntax/print/pprust.rs
+++ b/src/libsyntax/print/pprust.rs
@ -1609,7 +1609,7 @@ fn print_literal(s: ps, &&lit: @ast::lit) {
    alt lit.node {
      ast::lit_str(st) { print_string(s, st); }
      ast::lit_int(ch, ast::ty_char) {
-        word(s.s, "'" + escape_str(str::from_char(ch as char), '\'') + "'");
+        word(s.s, "'" + char::escape_default(ch as char) + "'");
      }
      ast::lit_int(i, t) {
        if i < 0_i64 {
@ -1714,32 +1714,10 @@ fn print_comment(s: ps, cmnt: comments::cmnt) {

 fn print_string(s: ps, st: str) {
    word(s.s, "\"");
-    word(s.s, escape_str(st, '"'));
+    word(s.s, str::escape_default(st));
    word(s.s, "\"");
 }

-fn escape_str(st: str, to_escape: char) -> str {
-    let mut out: str = "";
-    let len = str::len(st);
-    let mut i = 0u;
-    while i < len {
-        alt st[i] as char {
-          '\n' { out += "\\n"; }
-          '\t' { out += "\\t"; }
-          '\r' { out += "\\r"; }
-          '\\' { out += "\\\\"; }
-          cur {
-            if cur == to_escape { out += "\\"; }
-            // FIXME some (or all?) non-ascii things should be escaped
-            // (See #2306)
-            str::push_char(out, cur);
-          }
-        }
-        i += 1u;
-    }
-    ret out;
-}
-
 fn to_str<T>(t: T, f: fn@(ps, T)) -> str {
    let buffer = io::mem_buffer();
    let s = rust_printer(io::mem_buffer_writer(buffer));