core::str: Implement Chars iterator using slice::Items
Re-use the vector (slice) iterator to implement the chars iterator. The iterator relies on our guarantee that the string contains valid UTF-8; its only unsafe code is the transmute of the decoded u32 into a char.
This commit is contained in:
parent
d6b42c2463
commit
42357d772b
1 changed files with 116 additions and 46 deletions
|
@ -97,47 +97,121 @@ impl<'a> CharEq for &'a [char] {
|
||||||
Section: Iterators
|
Section: Iterators
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/// External iterator for a string's characters.
|
/// Iterator over the chars (each representing a *Unicode Scalar Value*) of a string
|
||||||
/// Use with the `std::iter` module.
|
///
|
||||||
|
/// Created with the method `.chars()`.
|
||||||
#[deriving(Clone)]
|
#[deriving(Clone)]
|
||||||
pub struct Chars<'a> {
|
pub struct Chars<'a> {
|
||||||
/// The slice remaining to be iterated
|
iter: slice::Items<'a, u8>
|
||||||
string: &'a str,
|
}
|
||||||
|
|
||||||
|
// Return the initial codepoint accumulator for the first byte.
|
||||||
|
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
|
||||||
|
// for width 3, and 3 bits for width 4
|
||||||
|
macro_rules! utf8_first_byte(
|
||||||
|
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
|
||||||
|
)
|
||||||
|
|
||||||
|
// return the value of $ch updated with continuation byte $byte
|
||||||
|
macro_rules! utf8_acc_cont_byte(
|
||||||
|
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
|
||||||
|
)
|
||||||
|
|
||||||
|
macro_rules! utf8_is_cont_byte(
|
||||||
|
($byte:expr) => (($byte & 192u8) == 128)
|
||||||
|
)
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
|
||||||
|
match opt {
|
||||||
|
Some(&byte) => byte,
|
||||||
|
None => 0,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator<char> for Chars<'a> {
|
impl<'a> Iterator<char> for Chars<'a> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next(&mut self) -> Option<char> {
|
fn next(&mut self) -> Option<char> {
|
||||||
// Decode the next codepoint, then update
|
// Decode UTF-8, using the valid UTF-8 invariant
|
||||||
// the slice to be just the remaining part
|
#[inline]
|
||||||
if self.string.len() != 0 {
|
fn decode_multibyte<'a>(x: u8, it: &mut slice::Items<'a, u8>) -> char {
|
||||||
let CharRange {ch, next} = self.string.char_range_at(0);
|
// NOTE: Performance is very sensitive to the exact formulation here
|
||||||
unsafe {
|
// Decode from a byte combination out of: [[[x y] z] w]
|
||||||
self.string = raw::slice_unchecked(self.string, next, self.string.len());
|
let cont_mask = 0x3F; // continuation byte mask
|
||||||
|
let init = utf8_first_byte!(x, 2);
|
||||||
|
let y = unwrap_or_0(it.next());
|
||||||
|
let mut ch = utf8_acc_cont_byte!(init, y);
|
||||||
|
if x >= 0xE0 {
|
||||||
|
/* [[x y z] w] case */
|
||||||
|
let z = unwrap_or_0(it.next());
|
||||||
|
|
||||||
|
let y_z = (((y & cont_mask) as u32) << 6) | (z & cont_mask) as u32;
|
||||||
|
ch = init << 12 | y_z;
|
||||||
|
if x >= 0xF0 {
|
||||||
|
/* [x y z w] case */
|
||||||
|
let w = unwrap_or_0(it.next());
|
||||||
|
ch = (init & 7) << 18 | y_z << 6 | (w & cont_mask) as u32;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unsafe {
|
||||||
|
mem::transmute(ch)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match self.iter.next() {
|
||||||
|
None => None,
|
||||||
|
Some(&next_byte) => {
|
||||||
|
if next_byte < 128 {
|
||||||
|
Some(next_byte as char)
|
||||||
|
} else {
|
||||||
|
Some(decode_multibyte(next_byte, &mut self.iter))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Some(ch)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn size_hint(&self) -> (uint, Option<uint>) {
|
fn size_hint(&self) -> (uint, Option<uint>) {
|
||||||
(self.string.len().saturating_add(3)/4, Some(self.string.len()))
|
let (len, _) = self.iter.size_hint();
|
||||||
|
(len.saturating_add(3) / 4, Some(len))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DoubleEndedIterator<char> for Chars<'a> {
|
impl<'a> DoubleEndedIterator<char> for Chars<'a> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next_back(&mut self) -> Option<char> {
|
fn next_back(&mut self) -> Option<char> {
|
||||||
if self.string.len() != 0 {
|
#[inline]
|
||||||
let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
|
fn decode_multibyte_back<'a>(w: u8, it: &mut slice::Items<'a, u8>) -> char {
|
||||||
unsafe {
|
// Decode from a byte combination out of: [x [y [z w]]]
|
||||||
self.string = raw::slice_unchecked(self.string, 0, next);
|
let mut ch;
|
||||||
|
let z = unwrap_or_0(it.next_back());
|
||||||
|
ch = utf8_first_byte!(z, 2);
|
||||||
|
if utf8_is_cont_byte!(z) {
|
||||||
|
let y = unwrap_or_0(it.next_back());
|
||||||
|
ch = utf8_first_byte!(y, 3);
|
||||||
|
if utf8_is_cont_byte!(y) {
|
||||||
|
let x = unwrap_or_0(it.next_back());
|
||||||
|
ch = utf8_first_byte!(x, 4);
|
||||||
|
ch = utf8_acc_cont_byte!(ch, y);
|
||||||
|
}
|
||||||
|
ch = utf8_acc_cont_byte!(ch, z);
|
||||||
|
}
|
||||||
|
ch = utf8_acc_cont_byte!(ch, w);
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
mem::transmute(ch)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match self.iter.next_back() {
|
||||||
|
None => None,
|
||||||
|
Some(&back_byte) => {
|
||||||
|
if back_byte < 128 {
|
||||||
|
Some(back_byte as char)
|
||||||
|
} else {
|
||||||
|
Some(decode_multibyte_back(back_byte, &mut self.iter))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Some(ch)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -146,18 +220,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
|
||||||
/// Use with the `std::iter` module.
|
/// Use with the `std::iter` module.
|
||||||
#[deriving(Clone)]
|
#[deriving(Clone)]
|
||||||
pub struct CharOffsets<'a> {
|
pub struct CharOffsets<'a> {
|
||||||
/// The original string to be iterated
|
front: uint,
|
||||||
string: &'a str,
|
back: uint,
|
||||||
iter: Chars<'a>,
|
iter: Chars<'a>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
|
impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next(&mut self) -> Option<(uint, char)> {
|
fn next(&mut self) -> Option<(uint, char)> {
|
||||||
// Compute the byte offset by using the pointer offset between
|
match self.iter.next() {
|
||||||
// the original string slice and the iterator's remaining part
|
None => None,
|
||||||
let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
|
Some(ch) => {
|
||||||
self.iter.next().map(|ch| (offset, ch))
|
let index = self.front;
|
||||||
|
let (len, _) = self.iter.iter.size_hint();
|
||||||
|
self.front += self.back - self.front - len;
|
||||||
|
Some((index, ch))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -169,11 +248,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
|
||||||
impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
|
impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next_back(&mut self) -> Option<(uint, char)> {
|
fn next_back(&mut self) -> Option<(uint, char)> {
|
||||||
self.iter.next_back().map(|ch| {
|
match self.iter.next_back() {
|
||||||
let offset = self.iter.string.len() +
|
None => None,
|
||||||
self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
|
Some(ch) => {
|
||||||
(offset, ch)
|
let (len, _) = self.iter.iter.size_hint();
|
||||||
})
|
self.back -= self.back - self.front - len;
|
||||||
|
Some((self.back, ch))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -880,18 +962,6 @@ pub struct CharRange {
|
||||||
pub next: uint,
|
pub next: uint,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the initial codepoint accumulator for the first byte.
|
|
||||||
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
|
|
||||||
// for width 3, and 3 bits for width 4
|
|
||||||
macro_rules! utf8_first_byte(
|
|
||||||
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
|
|
||||||
)
|
|
||||||
|
|
||||||
// return the value of $ch updated with continuation byte $byte
|
|
||||||
macro_rules! utf8_acc_cont_byte(
|
|
||||||
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
|
|
||||||
)
|
|
||||||
|
|
||||||
static TAG_CONT_U8: u8 = 128u8;
|
static TAG_CONT_U8: u8 = 128u8;
|
||||||
|
|
||||||
/// Unsafe operations
|
/// Unsafe operations
|
||||||
|
@ -1608,7 +1678,7 @@ impl<'a> StrSlice<'a> for &'a str {
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn chars(&self) -> Chars<'a> {
|
fn chars(&self) -> Chars<'a> {
|
||||||
Chars{string: *self}
|
Chars{iter: self.as_bytes().iter()}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -1618,7 +1688,7 @@ impl<'a> StrSlice<'a> for &'a str {
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn char_indices(&self) -> CharOffsets<'a> {
|
fn char_indices(&self) -> CharOffsets<'a> {
|
||||||
CharOffsets{string: *self, iter: self.chars()}
|
CharOffsets{front: 0, back: self.len(), iter: self.chars()}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue