1
Fork 0

Implement 2 kinds of char / str escaping. Use in rustc. Close #2306.

This commit is contained in:
Graydon Hoare 2012-05-31 15:31:13 -07:00
parent c2ce2741a7
commit 57f399bd63
5 changed files with 136 additions and 33 deletions

View file

@ -38,7 +38,8 @@ export is_alphabetic,
is_lowercase, is_uppercase,
is_whitespace, is_alphanumeric,
is_ascii, is_digit,
to_digit, cmp;
to_digit, cmp,
escape_default, escape_unicode;
import is_alphabetic = unicode::derived_property::Alphabetic;
import is_XID_start = unicode::derived_property::XID_Start;
@ -122,6 +123,53 @@ pure fn to_digit(c: char, radix: uint) -> option<uint> {
else { none }
}
#[doc = "
Return the hexadecimal unicode escape of a char.
The rules are as follows:
- chars in [0,0xff] get 2-digit escapes: `\\xNN`
- chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
- chars at or above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
"]
fn escape_unicode(c: char) -> str {
    // Hex digits of the code point, without any padding.
    let s = u32::to_str(c as u32, 16u);
    // Choose the escape letter and the fixed digit count for this range.
    let (kind, pad) = (if c <= '\xff' { ('x', 2u) }
                       else if c <= '\uffff' { ('u', 4u) }
                       else { ('U', 8u) });
    // The digits must fit in the chosen width, otherwise the padding loop
    // below could not produce a fixed-width escape.
    assert str::len(s) <= pad;
    let mut out = "\\";
    out += str::from_char(kind);
    // Left-pad with zeros up to the fixed width.
    for uint::range(str::len(s), pad) {|_i| out += "0"; }
    out += s;
    ret out;
}
#[doc = "
Return a 'default' ASCII and C++11-like char-literal escape of a char.
The default is chosen with a bias toward producing literals that are
legal in a variety of languages, including C++11 and similar C-family
languages. The exact rules are:
- Tab, CR and LF are escaped as '\\t', '\\r' and '\\n' respectively.
- Single-quote, double-quote and backslash chars are backslash-escaped.
- Any other chars in the range [0x20,0x7e] are not escaped.
- Any other chars are given hex unicode escapes; see `escape_unicode`.
"]
fn escape_default(c: char) -> str {
    alt c {
      '\t' { "\\t" }
      '\r' { "\\r" }
      '\n' { "\\n" }
      // Backslash and both quote chars lie inside [0x20,0x7e], so their
      // arms have to come before the pass-through range arm.
      '\\' { "\\\\" }
      '\'' { "\\'" }
      '"' { "\\\"" }
      '\x20' to '\x7e' { str::from_char(c) }
      _ { escape_unicode(c) }
    }
}
#[doc = "
Compare two chars
@ -198,3 +246,30 @@ fn test_is_digit() {
assert ! is_digit('Q');
}
#[test]
fn test_escape_default() {
    // Named escapes.
    assert escape_default('\n') == "\\n";
    assert escape_default('\r') == "\\r";
    assert escape_default('\t') == "\\t";
    // Backslash-escaped punctuation.
    assert escape_default('\\') == "\\\\";
    assert escape_default('\'') == "\\'";
    assert escape_default('"') == "\\\"";
    // Chars in [0x20,0x7e] pass through, including both ends of the range.
    assert escape_default(' ') == " ";
    assert escape_default('a') == "a";
    assert escape_default('~') == "~";
    // Everything else falls back to the hex unicode escape.
    assert escape_default('\x00') == "\\x00";
    assert escape_default('\x1f') == "\\x1f";
    assert escape_default('\x7f') == "\\x7f";
    assert escape_default('\xff') == "\\xff";
    assert escape_default('\u011b') == "\\u011b";
    assert escape_default('\U0001d4b6') == "\\U0001d4b6";
}
#[test]
fn test_escape_unicode() {
assert escape_unicode('\x00') == "\\x00";
assert escape_unicode('\n') == "\\x0a";
assert escape_unicode(' ') == "\\x20";
assert escape_unicode('a') == "\\x61";
assert escape_unicode('\u011b') == "\\u011b";
assert escape_unicode('\U0001d4b6') == "\\U0001d4b6";
}

View file

@ -97,6 +97,8 @@ export
reserve,
reserve_at_least,
capacity,
escape_default,
escape_unicode,
unsafe,
extensions;
@ -1625,6 +1627,22 @@ fn capacity(&&s: str) -> uint unsafe {
}
}
#[doc = "Escape each char in `s` with char::escape_default."]
fn escape_default(s: str) -> str {
    let mut escaped = "";
    // Pre-size the result using the input length as a starting estimate.
    reserve_at_least(escaped, str::len(s));
    // Append the escaped form of every char in order.
    chars_iter(s) {|ch|
        escaped += char::escape_default(ch);
    }
    ret escaped;
}
#[doc = "Escape each char in `s` with char::escape_unicode."]
fn escape_unicode(s: str) -> str {
    let mut escaped = "";
    // Pre-size the result using the input length as a starting estimate.
    reserve_at_least(escaped, str::len(s));
    // Append the hex escape of every char in order.
    chars_iter(s) {|ch|
        escaped += char::escape_unicode(ch);
    }
    ret escaped;
}
#[doc = "Unsafe operations"]
mod unsafe {
export
@ -1866,6 +1884,12 @@ impl extensions for str {
#[doc = "Returns a string with trailing whitespace removed"]
#[inline]
fn trim_right() -> str { trim_right(self) }
#[doc = "Escape each char in the string with char::escape_default."]
#[inline]
fn escape_default() -> str { escape_default(self) }
#[doc = "Escape each char in the string with char::escape_unicode."]
#[inline]
fn escape_unicode() -> str { escape_unicode(self) }
}
#[cfg(test)]
@ -2748,4 +2772,32 @@ mod tests {
assert *ptr::offset(buf,5u) == 0u8;
}
}
#[test]
fn test_escape_unicode() {
    // Empty input produces empty output.
    assert escape_unicode("") == "";
    // Every char is escaped, even plain printable ASCII.
    assert escape_unicode("abc") == "\\x61\\x62\\x63";
    assert escape_unicode("a c") == "\\x61\\x20\\x63";
    assert escape_unicode("\r\n\t") == "\\x0d\\x0a\\x09";
    assert escape_unicode("'\"\\") == "\\x27\\x22\\x5c";
    // Boundaries of the 2-, 4- and 8-digit forms.
    assert escape_unicode("\x00\x01\xfe\xff") == "\\x00\\x01\\xfe\\xff";
    assert escape_unicode("\u0100\uffff") == "\\u0100\\uffff";
    assert escape_unicode("\U00010000\U0010ffff") ==
        "\\U00010000\\U0010ffff";
    // Mixed widths within one string.
    assert escape_unicode("ab\ufb00") == "\\x61\\x62\\ufb00";
    assert escape_unicode("\U0001d4ea\r") == "\\U0001d4ea\\x0d";
}
#[test]
fn test_escape_default() {
assert escape_default("abc") == "abc";
assert escape_default("a c") == "a c";
assert escape_default("\r\n\t") == "\\r\\n\\t";
assert escape_default("'\"\\") == "\\'\\\"\\\\";
assert escape_default("\u0100\uffff") == "\\u0100\\uffff";
assert escape_default("\U00010000\U0010ffff") ==
"\\U00010000\\U0010ffff";
assert escape_default("ab\ufb00") == "ab\\ufb00";
assert escape_default("\U0001d4ea\r") == "\\U0001d4ea\\r";
}
}

View file

@ -434,6 +434,7 @@ fn next_token_inner(rdr: reader) -> token::token {
't' { c2 = '\t'; }
'\\' { c2 = '\\'; }
'\'' { c2 = '\''; }
'"' { c2 = '"'; }
'x' { c2 = scan_numeric_escape(rdr, 2u); }
'u' { c2 = scan_numeric_escape(rdr, 4u); }
'U' { c2 = scan_numeric_escape(rdr, 8u); }

View file

@ -122,11 +122,7 @@ fn to_str(in: interner<str>, t: token) -> str {
/* Literals */
LIT_INT(c, ast::ty_char) {
// FIXME: escape.
let mut tmp = "'";
str::push_char(tmp, c as char);
str::push_char(tmp, '\'');
ret tmp;
ret "'" + char::escape_default(c as char) + "'";
}
LIT_INT(i, t) {
ret int::to_str(i as int, 10u) + ast_util::int_ty_to_str(t);
@ -138,10 +134,11 @@ fn to_str(in: interner<str>, t: token) -> str {
ret interner::get::<str>(in, s) +
ast_util::float_ty_to_str(t);
}
LIT_STR(s) { // FIXME: escape.
ret "\"" + interner::get::<str>(in, s) + "\"";
LIT_STR(s) {
ret "\""
+ str::escape_default(interner::get::<str>(in, s))
+ "\"";
}
/* Name components */
IDENT(s, _) {
ret interner::get::<str>(in, s);

View file

@ -1609,7 +1609,7 @@ fn print_literal(s: ps, &&lit: @ast::lit) {
alt lit.node {
ast::lit_str(st) { print_string(s, st); }
ast::lit_int(ch, ast::ty_char) {
word(s.s, "'" + escape_str(str::from_char(ch as char), '\'') + "'");
word(s.s, "'" + char::escape_default(ch as char) + "'");
}
ast::lit_int(i, t) {
if i < 0_i64 {
@ -1714,32 +1714,10 @@ fn print_comment(s: ps, cmnt: comments::cmnt) {
// Print `st` as a double-quoted string literal. The contents are escaped
// with str::escape_default and emitted exactly once between the quotes.
fn print_string(s: ps, st: str) {
    word(s.s, "\"");
    word(s.s, str::escape_default(st));
    word(s.s, "\"");
}
fn escape_str(st: str, to_escape: char) -> str {
let mut out: str = "";
let len = str::len(st);
let mut i = 0u;
while i < len {
alt st[i] as char {
'\n' { out += "\\n"; }
'\t' { out += "\\t"; }
'\r' { out += "\\r"; }
'\\' { out += "\\\\"; }
cur {
if cur == to_escape { out += "\\"; }
// FIXME some (or all?) non-ascii things should be escaped
// (See #2306)
str::push_char(out, cur);
}
}
i += 1u;
}
ret out;
}
fn to_str<T>(t: T, f: fn@(ps, T)) -> str {
let buffer = io::mem_buffer();
let s = rust_printer(io::mem_buffer_writer(buffer));