Implement 2 kinds of char / str escaping. Use in rustc. Close #2306.
This commit is contained in:
parent
c2ce2741a7
commit
57f399bd63
5 changed files with 136 additions and 33 deletions
|
@ -38,7 +38,8 @@ export is_alphabetic,
|
|||
is_lowercase, is_uppercase,
|
||||
is_whitespace, is_alphanumeric,
|
||||
is_ascii, is_digit,
|
||||
to_digit, cmp;
|
||||
to_digit, cmp,
|
||||
escape_default, escape_unicode;
|
||||
|
||||
import is_alphabetic = unicode::derived_property::Alphabetic;
|
||||
import is_XID_start = unicode::derived_property::XID_Start;
|
||||
|
@ -122,6 +123,53 @@ pure fn to_digit(c: char, radix: uint) -> option<uint> {
|
|||
else { none }
|
||||
}
|
||||
|
||||
#[doc = "
|
||||
Return the hexadecimal unicode escape of a char.
|
||||
|
||||
The rules are as follows:
|
||||
|
||||
- chars in [0,0xff] get 2-digit escapes: `\\xNN`
|
||||
- chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
|
||||
- chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
|
||||
"]
|
||||
fn escape_unicode(c: char) -> str {
|
||||
let s = u32::to_str(c as u32, 16u);
|
||||
let (c, pad) = (if c <= '\xff' { ('x', 2u) }
|
||||
else if c <= '\uffff' { ('u', 4u) }
|
||||
else { ('U', 8u) });
|
||||
assert str::len(s) <= pad;
|
||||
let mut out = "\\";
|
||||
out += str::from_char(c);
|
||||
for uint::range(str::len(s), pad) {|_i| out += "0"; }
|
||||
out += s;
|
||||
ret out;
|
||||
}
|
||||
|
||||
#[doc = "
|
||||
Return a 'default' ASCII and C++11-like char-literal escape of a char.
|
||||
|
||||
The default is chosen with a bias toward producing literals that are
|
||||
legal in a variety of languages, including C++11 and similar C-family
|
||||
languages. The exact rules are:
|
||||
|
||||
- Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
|
||||
- Single-quote, double-quote and backslash chars are backslash-escaped.
|
||||
- Any other chars in the range [0x20,0x7e] are not escaped.
|
||||
- Any other chars are given hex unicode escapes; see `escape_unicode`.
|
||||
"]
|
||||
fn escape_default(c: char) -> str {
|
||||
alt c {
|
||||
'\t' { "\\t" }
|
||||
'\r' { "\\r" }
|
||||
'\n' { "\\n" }
|
||||
'\\' { "\\\\" }
|
||||
'\'' { "\\'" }
|
||||
'"' { "\\\"" }
|
||||
'\x20' to '\x7e' { str::from_char(c) }
|
||||
_ { escape_unicode(c) }
|
||||
}
|
||||
}
|
||||
|
||||
#[doc = "
|
||||
Compare two chars
|
||||
|
||||
|
@ -198,3 +246,30 @@ fn test_is_digit() {
|
|||
assert ! is_digit('Q');
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_default() {
|
||||
assert escape_default('\n') == "\\n";
|
||||
assert escape_default('\r') == "\\r";
|
||||
assert escape_default('\'') == "\\'";
|
||||
assert escape_default('"') == "\\\"";
|
||||
assert escape_default(' ') == " ";
|
||||
assert escape_default('a') == "a";
|
||||
assert escape_default('~') == "~";
|
||||
assert escape_default('\x00') == "\\x00";
|
||||
assert escape_default('\x1f') == "\\x1f";
|
||||
assert escape_default('\x7f') == "\\x7f";
|
||||
assert escape_default('\xff') == "\\xff";
|
||||
assert escape_default('\u011b') == "\\u011b";
|
||||
assert escape_default('\U0001d4b6') == "\\U0001d4b6";
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_escape_unicode() {
|
||||
assert escape_unicode('\x00') == "\\x00";
|
||||
assert escape_unicode('\n') == "\\x0a";
|
||||
assert escape_unicode(' ') == "\\x20";
|
||||
assert escape_unicode('a') == "\\x61";
|
||||
assert escape_unicode('\u011b') == "\\u011b";
|
||||
assert escape_unicode('\U0001d4b6') == "\\U0001d4b6";
|
||||
}
|
||||
|
|
|
@ -97,6 +97,8 @@ export
|
|||
reserve,
|
||||
reserve_at_least,
|
||||
capacity,
|
||||
escape_default,
|
||||
escape_unicode,
|
||||
|
||||
unsafe,
|
||||
extensions;
|
||||
|
@ -1625,6 +1627,22 @@ fn capacity(&&s: str) -> uint unsafe {
|
|||
}
|
||||
}
|
||||
|
||||
#[doc = "Escape each char in `s` with char::escape_default."]
|
||||
fn escape_default(s: str) -> str {
|
||||
let mut out: str = "";
|
||||
reserve_at_least(out, str::len(s));
|
||||
chars_iter(s) {|c| out += char::escape_default(c); }
|
||||
ret out;
|
||||
}
|
||||
|
||||
#[doc = "Escape each char in `s` with char::escape_unicode."]
|
||||
fn escape_unicode(s: str) -> str {
|
||||
let mut out: str = "";
|
||||
reserve_at_least(out, str::len(s));
|
||||
chars_iter(s) {|c| out += char::escape_unicode(c); }
|
||||
ret out;
|
||||
}
|
||||
|
||||
#[doc = "Unsafe operations"]
|
||||
mod unsafe {
|
||||
export
|
||||
|
@ -1866,6 +1884,12 @@ impl extensions for str {
|
|||
#[doc = "Returns a string with trailing whitespace removed"]
|
||||
#[inline]
|
||||
fn trim_right() -> str { trim_right(self) }
|
||||
#[doc = "Escape each char in `s` with char::escape_default."]
|
||||
#[inline]
|
||||
fn escape_default() -> str { escape_default(self) }
|
||||
#[doc = "Escape each char in `s` with char::escape_unicode."]
|
||||
#[inline]
|
||||
fn escape_unicode() -> str { escape_unicode(self) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -2748,4 +2772,32 @@ mod tests {
|
|||
assert *ptr::offset(buf,5u) == 0u8;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_unicode() {
|
||||
assert escape_unicode("abc") == "\\x61\\x62\\x63";
|
||||
assert escape_unicode("a c") == "\\x61\\x20\\x63";
|
||||
assert escape_unicode("\r\n\t") == "\\x0d\\x0a\\x09";
|
||||
assert escape_unicode("'\"\\") == "\\x27\\x22\\x5c";
|
||||
assert escape_unicode("\x00\x01\xfe\xff") == "\\x00\\x01\\xfe\\xff";
|
||||
assert escape_unicode("\u0100\uffff") == "\\u0100\\uffff";
|
||||
assert escape_unicode("\U00010000\U0010ffff") ==
|
||||
"\\U00010000\\U0010ffff";
|
||||
assert escape_unicode("ab\ufb00") == "\\x61\\x62\\ufb00";
|
||||
assert escape_unicode("\U0001d4ea\r") == "\\U0001d4ea\\x0d";
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_default() {
|
||||
assert escape_default("abc") == "abc";
|
||||
assert escape_default("a c") == "a c";
|
||||
assert escape_default("\r\n\t") == "\\r\\n\\t";
|
||||
assert escape_default("'\"\\") == "\\'\\\"\\\\";
|
||||
assert escape_default("\u0100\uffff") == "\\u0100\\uffff";
|
||||
assert escape_default("\U00010000\U0010ffff") ==
|
||||
"\\U00010000\\U0010ffff";
|
||||
assert escape_default("ab\ufb00") == "ab\\ufb00";
|
||||
assert escape_default("\U0001d4ea\r") == "\\U0001d4ea\\r";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -434,6 +434,7 @@ fn next_token_inner(rdr: reader) -> token::token {
|
|||
't' { c2 = '\t'; }
|
||||
'\\' { c2 = '\\'; }
|
||||
'\'' { c2 = '\''; }
|
||||
'"' { c2 = '"'; }
|
||||
'x' { c2 = scan_numeric_escape(rdr, 2u); }
|
||||
'u' { c2 = scan_numeric_escape(rdr, 4u); }
|
||||
'U' { c2 = scan_numeric_escape(rdr, 8u); }
|
||||
|
|
|
@ -122,11 +122,7 @@ fn to_str(in: interner<str>, t: token) -> str {
|
|||
|
||||
/* Literals */
|
||||
LIT_INT(c, ast::ty_char) {
|
||||
// FIXME: escape.
|
||||
let mut tmp = "'";
|
||||
str::push_char(tmp, c as char);
|
||||
str::push_char(tmp, '\'');
|
||||
ret tmp;
|
||||
ret "'" + char::escape_default(c as char) + "'";
|
||||
}
|
||||
LIT_INT(i, t) {
|
||||
ret int::to_str(i as int, 10u) + ast_util::int_ty_to_str(t);
|
||||
|
@ -138,10 +134,11 @@ fn to_str(in: interner<str>, t: token) -> str {
|
|||
ret interner::get::<str>(in, s) +
|
||||
ast_util::float_ty_to_str(t);
|
||||
}
|
||||
LIT_STR(s) { // FIXME: escape.
|
||||
ret "\"" + interner::get::<str>(in, s) + "\"";
|
||||
LIT_STR(s) {
|
||||
ret "\""
|
||||
+ str::escape_default(interner::get::<str>(in, s))
|
||||
+ "\"";
|
||||
}
|
||||
|
||||
/* Name components */
|
||||
IDENT(s, _) {
|
||||
ret interner::get::<str>(in, s);
|
||||
|
|
|
@ -1609,7 +1609,7 @@ fn print_literal(s: ps, &&lit: @ast::lit) {
|
|||
alt lit.node {
|
||||
ast::lit_str(st) { print_string(s, st); }
|
||||
ast::lit_int(ch, ast::ty_char) {
|
||||
word(s.s, "'" + escape_str(str::from_char(ch as char), '\'') + "'");
|
||||
word(s.s, "'" + char::escape_default(ch as char) + "'");
|
||||
}
|
||||
ast::lit_int(i, t) {
|
||||
if i < 0_i64 {
|
||||
|
@ -1714,32 +1714,10 @@ fn print_comment(s: ps, cmnt: comments::cmnt) {
|
|||
|
||||
fn print_string(s: ps, st: str) {
|
||||
word(s.s, "\"");
|
||||
word(s.s, escape_str(st, '"'));
|
||||
word(s.s, str::escape_default(st));
|
||||
word(s.s, "\"");
|
||||
}
|
||||
|
||||
fn escape_str(st: str, to_escape: char) -> str {
|
||||
let mut out: str = "";
|
||||
let len = str::len(st);
|
||||
let mut i = 0u;
|
||||
while i < len {
|
||||
alt st[i] as char {
|
||||
'\n' { out += "\\n"; }
|
||||
'\t' { out += "\\t"; }
|
||||
'\r' { out += "\\r"; }
|
||||
'\\' { out += "\\\\"; }
|
||||
cur {
|
||||
if cur == to_escape { out += "\\"; }
|
||||
// FIXME some (or all?) non-ascii things should be escaped
|
||||
// (See #2306)
|
||||
str::push_char(out, cur);
|
||||
}
|
||||
}
|
||||
i += 1u;
|
||||
}
|
||||
ret out;
|
||||
}
|
||||
|
||||
fn to_str<T>(t: T, f: fn@(ps, T)) -> str {
|
||||
let buffer = io::mem_buffer();
|
||||
let s = rust_printer(io::mem_buffer_writer(buffer));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue