From d8b0a1910ab424a17b065138234baea47285a7a0 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sun, 15 Jan 2012 20:20:06 -0800 Subject: [PATCH] Added string functions: split_func, split_char, lines, lines_any, words, and more tests --- src/libcore/str.rs | 106 ++++++++++++++++++++++++++++++++++++---- src/test/stdtest/str.rs | 92 ++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 9 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index cba9b11cb3d..4816d913c46 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -7,7 +7,8 @@ String manipulation. export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len, byte_len_range, index, rindex, find, starts_with, ends_with, substr, slice, split, splitn, - split_str, concat, connect, to_lower, to_upper, replace, char_slice, + split_str, split_func, split_char, lines, lines_any, words, + concat, connect, to_lower, to_upper, replace, char_slice, trim_left, trim_right, trim, unshift_char, shift_char, pop_char, push_char, is_utf8, from_chars, to_chars, char_len, char_len_range, char_at, bytes, is_ascii, shift_byte, pop_byte, @@ -252,7 +253,7 @@ fn from_chars(chs: [char]) -> str { /* Function: utf8_char_width -FIXME: What does this function do? +Given a first byte, determine how many bytes are in this UTF-8 character */ pure fn utf8_char_width(b: u8) -> uint { let byte: uint = b as uint; @@ -275,15 +276,27 @@ Pluck a character out of a string and return the index of the next character. This function can be used to iterate over the unicode characters of a string. 
Example: - -> let s = "Clam chowder, hot sauce, pork rinds"; -> let i = 0; -> while i < len(s) { -> let {ch, next} = char_range_at(s, i); -> log(debug, ch); -> i = next; +> let s = "中华Việt Nam"; +> let i = 0u; +> while i < str::byte_len(s) { +> let {ch, next} = str::char_range_at(s, i); +> std::io::println(#fmt("%u: %c",i,ch)); +> i = next; > } +Example output: + + 0: 中 + 3: 华 + 6: V + 7: i + 8: ệ + 11: t + 12: + 13: N + 14: a + 15: m + Parameters: s - The string @@ -721,6 +734,8 @@ Split a string at each occurance of a given separator Returns: A vector containing all the strings between each occurance of the separator + +FIXME: should be renamed to split_byte */ fn split(s: str, sep: u8) -> [str] { let v: [str] = []; @@ -772,6 +787,9 @@ leading fields are suppressed, and empty trailing fields are preserved. Returns: A vector containing all the strings between each occurrence of the separator. + +FIXME: should behave like split and split_char: + assert ["", "XXX", "YYY", ""] == split_str(".XXX.YYY.", "."); */ fn split_str(s: str, sep: str) -> [str] { assert byte_len(sep) > 0u; @@ -799,6 +817,76 @@ fn split_str(s: str, sep: str) -> [str] { ret v; } +/* +Function: split_func + +Splits a string into substrings using a function +(unicode safe) + +FIXME: will be renamed to split. 
+*/ +fn split_func(ss: str, sepfn: fn&(cc: char)->bool) -> [str] { + let vv: [str] = []; + let accum: str = ""; + let ends_with_sep: bool = false; + + str::iter_chars(ss, {|cc| if sepfn(cc) { + vv += [accum]; + accum = ""; + ends_with_sep = true; + } else { + str::push_char(accum, cc); + ends_with_sep = false; + } + }); + + if char_len(accum) >= 0u || ends_with_sep { /* NOTE: uint >= 0u is always true, so the final (possibly empty) field is always pushed */ + vv += [accum]; + } + + ret vv; +} + +/* +Function: split_char + +Splits a string into a vector of the substrings separated by a given character (empty substrings are kept) +*/ +fn split_char(ss: str, cc: char) -> [str] { + split_func(ss, {|kk| kk == cc}) +} + +/* +Function: lines + +Splits a string into a vector of the substrings +separated by LF ('\n'); a CR in front of the LF is left in place +*/ +fn lines(ss: str) -> [str] { + split_func(ss, {|cc| cc == '\n'}) +} + +/* +Function: lines_any + +Splits a string into a vector of the substrings separated by LF ('\n') +and/or CR LF ('\r\n'); one trailing CR is removed from each substring, any other trailing whitespace is kept +*/ +fn lines_any(ss: str) -> [str] { + vec::map(lines(ss), {|s| if ends_with(s, "\r") { slice(s, 0u, byte_len(s) - 1u) } else { s }}) +} + +/* +Function: words + +Splits a string into a vector of the substrings +separated by whitespace; empty substrings are discarded +*/ +fn words(ss: str) -> [str] { + ret vec::filter( split_func(ss, {|cc| char::is_whitespace(cc)}), + {|w| 0u < str::char_len(w)}); +} + /* Function: concat diff --git a/src/test/stdtest/str.rs b/src/test/stdtest/str.rs index 39217fb8a8b..26b5c2e9df7 100644 --- a/src/test/stdtest/str.rs +++ b/src/test/stdtest/str.rs @@ -80,12 +80,83 @@ fn test_split_str() { let v = str::split_str(s, sep); assert str::eq(v[i], k); } + + //FIXME: should behave like split and split_char: + //assert ["", "XXX", "YYY", ""] == str::split_str(".XXX.YYY.", "."); + t("abc::hello::there", "::", 0, "abc"); t("abc::hello::there", "::", 1, "hello"); t("abc::hello::there", "::", 2, "there"); t("::hello::there", "::", 0, "hello"); t("hello::there::", "::", 2, ""); t("::hello::there::", "::", 2, ""); + t("ประเทศไทย中华Việt Nam", "中华", 0, "ประเทศไทย"); + t("ประเทศไทย中华Việt Nam", "中华", 1, "Việt Nam"); +} + +#[test] +fn test_split_func () { + 
let data = "ประเทศไทย中华Việt Nam"; + assert ["ประเทศไทย中", "Việt Nam"] + == str::split_func (data, {|cc| cc == '华'}); + + assert ["", "", "XXX", "YYY", ""] + == str::split_func("zzXXXzYYYz", char::is_lowercase); + + assert ["zz", "", "", "z", "", "", "z"] + == str::split_func("zzXXXzYYYz", char::is_uppercase); + + assert ["",""] == str::split_func("z", {|cc| cc == 'z'}); + assert [""] == str::split_func("", {|cc| cc == 'z'}); + assert ["ok"] == str::split_func("ok", {|cc| cc == 'z'}); +} + +#[test] +fn test_split_char () { /* split_char on multi-byte and ASCII separators; leading, repeated and trailing separators yield empty fields */ + let data = "ประเทศไทย中华Việt Nam"; + assert ["ประเทศไทย中", "Việt Nam"] + == str::split_char(data, '华'); + + assert ["", "", "XXX", "YYY", ""] + == str::split_char("zzXXXzYYYz", 'z'); + assert ["",""] == str::split_char("z", 'z'); + assert [""] == str::split_char("", 'z'); + assert ["ok"] == str::split_char("ok", 'z'); +} + +#[test] +fn test_lines () { /* lines splits on LF only and keeps the CR of CR LF; lines_any gives the same fields for LF and CR LF input */ + let lf = "\nMary had a little lamb\nLittle lamb\n"; + let crlf = "\r\nMary had a little lamb\r\nLittle lamb\r\n"; + + assert ["", "Mary had a little lamb", "Little lamb", ""] + == str::lines(lf); + + assert ["", "Mary had a little lamb", "Little lamb", ""] + == str::lines_any(lf); + + assert ["\r", "Mary had a little lamb\r", "Little lamb\r", ""] + == str::lines(crlf); + + assert ["", "Mary had a little lamb", "Little lamb", ""] + == str::lines_any(crlf); + + assert [""] == str::lines (""); + assert [""] == str::lines_any(""); + assert ["",""] == str::lines ("\n"); + assert ["",""] == str::lines_any("\n"); + assert ["banana"] == str::lines ("banana"); + assert ["banana"] == str::lines_any("banana"); +} + +#[test] +fn test_words () { /* words returns no empty strings for leading/trailing whitespace, and [] for the empty string */ + let data = "\nMary had a little lamb\nLittle lamb\n"; + assert ["Mary","had","a","little","lamb","Little","lamb"] + == str::words(data); + + assert ["ok"] == str::words("ok"); + assert [] == str::words(""); +} #[test] @@ -215,6 +286,27 @@ fn test_char_slice() { assert (str::eq("bc", str::char_slice("abc", 1u, 3u))); assert (str::eq("", str::char_slice("abc", 1u, 
1u))); assert (str::eq("\u65e5", str::char_slice("\u65e5\u672c", 0u, 1u))); + + let data = "ประเทศไทย中华"; + assert (str::eq("ป", str::char_slice(data, 0u, 1u))); + assert (str::eq("ร", str::char_slice(data, 1u, 2u))); + assert (str::eq("华", str::char_slice(data, 10u, 11u))); + assert (str::eq("", str::char_slice(data, 1u, 1u))); + + fn a_million_letter_X() -> str { + let i = 0; + let rs = ""; + while i < 100000 { rs += "华华华华华华华华华华"; i += 1; } + ret rs; + } + fn half_a_million_letter_X() -> str { + let i = 0; + let rs = ""; + while i < 100000 { rs += "华华华华华"; i += 1; } + ret rs; + } + assert (str::eq(half_a_million_letter_X(), + str::char_slice(a_million_letter_X(), 0u, 500000u))); } #[test]