1
Fork 0

auto merge of #15283 : kwantam/rust/master, r=alexcrichton

Add libunicode; move unicode functions from core

- created new crate, libunicode, below libstd
- split `Char` trait into `Char` (libcore) and `UnicodeChar` (libunicode)
  - Unicode-aware functions now live in libunicode
    - `is_alphabetic`, `is_XID_start`, `is_XID_continue`, `is_lowercase`,
      `is_uppercase`, `is_whitespace`, `is_alphanumeric`, `is_control`, `is_digit`,
      `to_uppercase`, `to_lowercase`
  - added `width` method in UnicodeChar trait
    - determines printed width of character in columns, or None if it is a non-NULL control character
    - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise)
- split `StrSlice` into `StrSlice` (libcore) and `UnicodeStrSlice` (libunicode)
  - functionality formerly in `StrSlice` that relied upon Unicode functionality from `Char` is now in `UnicodeStrSlice`
    - `words`, `is_whitespace`, `is_alphanumeric`, `trim`, `trim_left`, `trim_right`
  - also moved `Words` type alias into libunicode because `words` method is in `UnicodeStrSlice`
- unified Unicode tables from libcollections, libcore, and libregex into libunicode
- updated `unicode.py` in `src/etc` to generate aforementioned tables
- generated new tables based on latest Unicode data
- added `UnicodeChar` and `UnicodeStrSlice` traits to prelude
- libunicode is now the collection point for the `std::char` module, combining the libunicode functionality with the `Char` functionality from libcore
  - thus, moved doc comment for `char` from `core::char` to `unicode::char`
- libcollections remains the collection point for `std::str`

The Unicode-aware functions that previously lived in the `Char` and `StrSlice` traits are no longer available to programs that only use libcore. To regain use of these methods, include the libunicode crate and `use` the `UnicodeChar` and/or `UnicodeStrSlice` traits:

    extern crate unicode;
    use unicode::UnicodeChar;
    use unicode::UnicodeStrSlice;
    use unicode::Words; // if you want to use the words() method

NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude.

closes #15224
[breaking-change]
This commit is contained in:
bors 2014-07-09 18:36:30 +00:00
commit fa7cbb5a46
27 changed files with 7445 additions and 11597 deletions

View file

@ -22,7 +22,7 @@ use cmp;
use cmp::{PartialEq, Eq};
use collections::Collection;
use default::Default;
use iter::{Filter, Map, Iterator};
use iter::{Map, Iterator};
use iter::{DoubleEndedIterator, ExactSize};
use iter::range;
use num::{CheckedMul, Saturating};
@ -204,10 +204,6 @@ pub struct CharSplitsN<'a, Sep> {
invert: bool,
}
/// An iterator over the words of a string, separated by a sequence of whitespace.
///
/// Concretely, this is a `Filter` adaptor wrapped around a `CharSplits`
/// whose separator is a `fn(char) -> bool` predicate.
pub type Words<'a> =
Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
/// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
///
/// Concretely, this is a `Map` adaptor wrapped around a `CharSplits`
/// whose separator is a single `char`.
pub type AnyLines<'a> =
Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
@ -1209,48 +1205,6 @@ pub trait StrSlice<'a> {
/// ```
fn lines_any(&self) -> AnyLines<'a>;
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
///
/// Whitespace characters are determined by `char::is_whitespace`.
///
/// # Example
///
/// ```rust
/// let some_words = " Mary had\ta little \n\t lamb";
/// let v: Vec<&str> = some_words.words().collect();
/// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
/// ```
fn words(&self) -> Words<'a>;
/// Returns true if the string contains only whitespace.
///
/// Whitespace characters are determined by `char::is_whitespace`.
/// The empty string has no non-whitespace characters, so it also
/// yields true.
///
/// # Example
///
/// ```rust
/// assert!(" \t\n".is_whitespace());
/// assert!("".is_whitespace());
///
/// assert!( !"abc".is_whitespace());
/// ```
fn is_whitespace(&self) -> bool;
/// Returns true if the string contains only alphanumeric code
/// points.
///
/// Alphanumeric characters are determined by `char::is_alphanumeric`.
/// The empty string has no non-alphanumeric code points, so it also
/// yields true.
///
/// # Example
///
/// ```rust
/// assert!("Löwe老虎Léopard123".is_alphanumeric());
/// assert!("".is_alphanumeric());
///
/// assert!( !" &*~".is_alphanumeric());
/// ```
fn is_alphanumeric(&self) -> bool;
/// Returns the number of Unicode code points (`char`) that a
/// string holds.
///
@ -1368,15 +1322,6 @@ pub trait StrSlice<'a> {
/// Returns true if `needle` is a suffix of the string.
fn ends_with(&self, needle: &str) -> bool;
/// Returns a slice of the string with leading and trailing whitespace removed.
///
/// Whitespace characters are determined by `char::is_whitespace`.
fn trim(&self) -> &'a str;
/// Returns a slice of the string with leading whitespace removed.
///
/// Whitespace characters are determined by `char::is_whitespace`.
fn trim_left(&self) -> &'a str;
/// Returns a slice of the string with trailing whitespace removed.
///
/// Whitespace characters are determined by `char::is_whitespace`.
fn trim_right(&self) -> &'a str;
/// Returns a string with characters that match `to_trim` removed.
///
/// # Arguments
@ -1748,17 +1693,6 @@ impl<'a> StrSlice<'a> for &'a str {
})
}
/// Splits the string wherever `char::is_whitespace` matches, then drops
/// the empty pieces that runs of consecutive whitespace leave behind.
#[inline]
fn words(&self) -> Words<'a> {
    let pieces = self.split(char::is_whitespace);
    pieces.filter(|piece| !piece.is_empty())
}
/// True when every `char` of the string satisfies `char::is_whitespace`.
#[inline]
fn is_whitespace(&self) -> bool {
    self.chars().all(|ch| char::is_whitespace(ch))
}
/// True when every `char` of the string satisfies `char::is_alphanumeric`.
#[inline]
fn is_alphanumeric(&self) -> bool {
    self.chars().all(|ch| char::is_alphanumeric(ch))
}
/// Counts the items yielded by `chars()`, i.e. the number of `char`s in
/// the string.
#[inline]
fn char_len(&self) -> uint { self.chars().count() }
@ -1817,21 +1751,6 @@ impl<'a> StrSlice<'a> for &'a str {
m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
}
/// Strips whitespace from both ends: the leading side first, then the
/// trailing side of what remains.
#[inline]
fn trim(&self) -> &'a str {
    let without_leading = self.trim_left();
    without_leading.trim_right()
}
/// Strips leading characters for which `char::is_whitespace` holds,
/// delegating to `trim_left_chars`.
#[inline]
fn trim_left(&self) -> &'a str {
    let is_blank = char::is_whitespace;
    self.trim_left_chars(is_blank)
}
/// Strips trailing characters for which `char::is_whitespace` holds,
/// delegating to `trim_right_chars`.
#[inline]
fn trim_right(&self) -> &'a str {
    let is_blank = char::is_whitespace;
    self.trim_right_chars(is_blank)
}
#[inline]
fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
let cur = match self.find(|c: char| !to_trim.matches(c)) {