From 57f399bd638c211005e8565609e928db33ebf864 Mon Sep 17 00:00:00 2001 From: Graydon Hoare Date: Thu, 31 May 2012 15:31:13 -0700 Subject: [PATCH] Implement 2 kinds of char / str escaping. Use in rustc. Close #2306. --- src/libcore/char.rs | 77 ++++++++++++++++++++++++++++++++++- src/libcore/str.rs | 52 +++++++++++++++++++++++ src/libsyntax/parse/lexer.rs | 1 + src/libsyntax/parse/token.rs | 13 +++--- src/libsyntax/print/pprust.rs | 26 +----------- 5 files changed, 136 insertions(+), 33 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 910c28409a1..fdc44e02696 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -38,7 +38,8 @@ export is_alphabetic, is_lowercase, is_uppercase, is_whitespace, is_alphanumeric, is_ascii, is_digit, - to_digit, cmp; + to_digit, cmp, + escape_default, escape_unicode; import is_alphabetic = unicode::derived_property::Alphabetic; import is_XID_start = unicode::derived_property::XID_Start; @@ -122,6 +123,53 @@ pure fn to_digit(c: char, radix: uint) -> option<uint> { else { none } } +#[doc = " +Return the hexadecimal unicode escape of a char. + +The rules are as follows: + + - chars in [0,0xff] get 2-digit escapes: `\\xNN` + - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN` + - chars at or above 0x10000 get 8-digit escapes: `\\UNNNNNNNN` +"] +fn escape_unicode(c: char) -> str { + let s = u32::to_str(c as u32, 16u); + let (c, pad) = (if c <= '\xff' { ('x', 2u) } + else if c <= '\uffff' { ('u', 4u) } + else { ('U', 8u) }); + assert str::len(s) <= pad; + let mut out = "\\"; + out += str::from_char(c); + for uint::range(str::len(s), pad) {|_i| out += "0"; } + out += s; + ret out; +} + +#[doc = " +Return a 'default' ASCII and C++11-like char-literal escape of a char. + +The default is chosen with a bias toward producing literals that are +legal in a variety of languages, including C++11 and similar C-family +languages. The exact rules are: + + - Tab, CR and LF are escaped as '\\t', '\\r' and '\\n' respectively.
+ - Single-quote, double-quote and backslash chars are backslash-escaped. + - Any other chars in the range [0x20,0x7e] are not escaped. + - Any other chars are given hex unicode escapes; see `escape_unicode`. +"] +fn escape_default(c: char) -> str { + alt c { + '\t' { "\\t" } + '\r' { "\\r" } + '\n' { "\\n" } + '\\' { "\\\\" } + '\'' { "\\'" } + '"' { "\\\"" } + '\x20' to '\x7e' { str::from_char(c) } + _ { escape_unicode(c) } + } +} + #[doc = " Compare two chars @@ -198,3 +246,30 @@ fn test_is_digit() { assert ! is_digit('Q'); } +#[test] +fn test_escape_default() { + assert escape_default('\n') == "\\n"; + assert escape_default('\r') == "\\r"; + assert escape_default('\'') == "\\'"; + assert escape_default('"') == "\\\""; + assert escape_default(' ') == " "; + assert escape_default('a') == "a"; + assert escape_default('~') == "~"; + assert escape_default('\x00') == "\\x00"; + assert escape_default('\x1f') == "\\x1f"; + assert escape_default('\x7f') == "\\x7f"; + assert escape_default('\xff') == "\\xff"; + assert escape_default('\u011b') == "\\u011b"; + assert escape_default('\U0001d4b6') == "\\U0001d4b6"; +} + + +#[test] +fn test_escape_unicode() { + assert escape_unicode('\x00') == "\\x00"; + assert escape_unicode('\n') == "\\x0a"; + assert escape_unicode(' ') == "\\x20"; + assert escape_unicode('a') == "\\x61"; + assert escape_unicode('\u011b') == "\\u011b"; + assert escape_unicode('\U0001d4b6') == "\\U0001d4b6"; +} diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 71d7c69485f..27bc1586273 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -97,6 +97,8 @@ export reserve, reserve_at_least, capacity, + escape_default, + escape_unicode, unsafe, extensions; @@ -1625,6 +1627,22 @@ fn capacity(&&s: str) -> uint unsafe { } } +#[doc = "Escape each char in `s` with char::escape_default."] +fn escape_default(s: str) -> str { + let mut out: str = ""; + reserve_at_least(out, str::len(s)); + chars_iter(s) {|c| out += char::escape_default(c); } + ret out; +} + 
+#[doc = "Escape each char in `s` with char::escape_unicode."] +fn escape_unicode(s: str) -> str { + let mut out: str = ""; + reserve_at_least(out, str::len(s)); + chars_iter(s) {|c| out += char::escape_unicode(c); } + ret out; +} + #[doc = "Unsafe operations"] mod unsafe { export @@ -1866,6 +1884,12 @@ impl extensions for str { #[doc = "Returns a string with trailing whitespace removed"] #[inline] fn trim_right() -> str { trim_right(self) } + #[doc = "Escape each char in the string with char::escape_default."] + #[inline] + fn escape_default() -> str { escape_default(self) } + #[doc = "Escape each char in the string with char::escape_unicode."] + #[inline] + fn escape_unicode() -> str { escape_unicode(self) } } #[cfg(test)] @@ -2748,4 +2772,32 @@ mod tests { assert *ptr::offset(buf,5u) == 0u8; } } + + #[test] + fn test_escape_unicode() { + assert escape_unicode("abc") == "\\x61\\x62\\x63"; + assert escape_unicode("a c") == "\\x61\\x20\\x63"; + assert escape_unicode("\r\n\t") == "\\x0d\\x0a\\x09"; + assert escape_unicode("'\"\\") == "\\x27\\x22\\x5c"; + assert escape_unicode("\x00\x01\xfe\xff") == "\\x00\\x01\\xfe\\xff"; + assert escape_unicode("\u0100\uffff") == "\\u0100\\uffff"; + assert escape_unicode("\U00010000\U0010ffff") == + "\\U00010000\\U0010ffff"; + assert escape_unicode("ab\ufb00") == "\\x61\\x62\\ufb00"; + assert escape_unicode("\U0001d4ea\r") == "\\U0001d4ea\\x0d"; + } + + #[test] + fn test_escape_default() { + assert escape_default("abc") == "abc"; + assert escape_default("a c") == "a c"; + assert escape_default("\r\n\t") == "\\r\\n\\t"; + assert escape_default("'\"\\") == "\\'\\\"\\\\"; + assert escape_default("\u0100\uffff") == "\\u0100\\uffff"; + assert escape_default("\U00010000\U0010ffff") == + "\\U00010000\\U0010ffff"; + assert escape_default("ab\ufb00") == "ab\\ufb00"; + assert escape_default("\U0001d4ea\r") == "\\U0001d4ea\\r"; + } + } diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index 63dc85e865d..bef5f6d1fd8 100644 ---
a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -434,6 +434,7 @@ fn next_token_inner(rdr: reader) -> token::token { 't' { c2 = '\t'; } '\\' { c2 = '\\'; } '\'' { c2 = '\''; } + '"' { c2 = '"'; } 'x' { c2 = scan_numeric_escape(rdr, 2u); } 'u' { c2 = scan_numeric_escape(rdr, 4u); } 'U' { c2 = scan_numeric_escape(rdr, 8u); } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index 1c6f240cf82..044509a5144 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -122,11 +122,7 @@ fn to_str(in: interner, t: token) -> str { /* Literals */ LIT_INT(c, ast::ty_char) { - // FIXME: escape. - let mut tmp = "'"; - str::push_char(tmp, c as char); - str::push_char(tmp, '\''); - ret tmp; + ret "'" + char::escape_default(c as char) + "'"; } LIT_INT(i, t) { ret int::to_str(i as int, 10u) + ast_util::int_ty_to_str(t); @@ -138,10 +134,11 @@ fn to_str(in: interner, t: token) -> str { ret interner::get::<str>(in, s) + ast_util::float_ty_to_str(t); } - LIT_STR(s) { // FIXME: escape.
- ret "\"" + interner::get::<str>(in, s) + "\""; + LIT_STR(s) { + ret "\"" + + str::escape_default(interner::get::<str>(in, s)) + + "\""; } - /* Name components */ IDENT(s, _) { ret interner::get::<str>(in, s); diff --git a/src/libsyntax/print/pprust.rs b/src/libsyntax/print/pprust.rs index 8206bfd2a4a..dd935533b7f 100644 --- a/src/libsyntax/print/pprust.rs +++ b/src/libsyntax/print/pprust.rs @@ -1609,7 +1609,7 @@ fn print_literal(s: ps, &&lit: @ast::lit) { alt lit.node { ast::lit_str(st) { print_string(s, st); } ast::lit_int(ch, ast::ty_char) { - word(s.s, "'" + escape_str(str::from_char(ch as char), '\'') + "'"); + word(s.s, "'" + char::escape_default(ch as char) + "'"); } ast::lit_int(i, t) { if i < 0_i64 { @@ -1714,32 +1714,10 @@ fn print_comment(s: ps, cmnt: comments::cmnt) { fn print_string(s: ps, st: str) { word(s.s, "\""); - word(s.s, escape_str(st, '"')); + word(s.s, str::escape_default(st)); word(s.s, "\""); } -fn escape_str(st: str, to_escape: char) -> str { - let mut out: str = ""; - let len = str::len(st); - let mut i = 0u; - while i < len { - alt st[i] as char { - '\n' { out += "\\n"; } - '\t' { out += "\\t"; } - '\r' { out += "\\r"; } - '\\' { out += "\\\\"; } - cur { - if cur == to_escape { out += "\\"; } - // FIXME some (or all?) non-ascii things should be escaped - // (See #2306) - str::push_char(out, cur); - } - } - i += 1u; - } - ret out; -} - fn to_str<T>(t: T, f: fn@(ps, T)) -> str { let buffer = io::mem_buffer(); let s = rust_printer(io::mem_buffer_writer(buffer));