Auto merge of #27808 - SimonSapin:utf16decoder, r=alexcrichton
* Rename `Utf16Items` to `Utf16Decoder`. "Items" is meaningless. * Generalize it to any `u16` iterator, not just `[u16].iter()`. * Make it yield `Result` instead of a custom `Utf16Item` enum that was isomorphic to `Result`. This enables using the `FromIterator for Result` impl. * Replace `Utf16Item::to_char_lossy` with a `Utf16Decoder::lossy` iterator adaptor. This is a [breaking change], but only for users of the unstable `rustc_unicode` crate. I’d like this functionality to be stabilized and re-exported in `std` eventually, as the "low-level equivalent" of `String::from_utf16` and `String::from_utf16_lossy` like #27784 is the low-level equivalent of #27714. CC @aturon, @alexcrichton
This commit is contained in:
commit
fd302a95e1
16 changed files with 293 additions and 171 deletions
|
@ -73,6 +73,7 @@ use boxed::Box;
|
|||
|
||||
use core::sync::atomic;
|
||||
use core::sync::atomic::Ordering::{Relaxed, Release, Acquire, SeqCst};
|
||||
use core::borrow;
|
||||
use core::fmt;
|
||||
use core::cmp::Ordering;
|
||||
use core::mem::{align_of_val, size_of_val};
|
||||
|
@ -1109,3 +1110,7 @@ mod tests {
|
|||
assert!(y.upgrade().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ?Sized> borrow::Borrow<T> for Arc<T> {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
|
|
@ -57,6 +57,7 @@ use heap;
|
|||
use raw_vec::RawVec;
|
||||
|
||||
use core::any::Any;
|
||||
use core::borrow;
|
||||
use core::cmp::Ordering;
|
||||
use core::fmt;
|
||||
use core::hash::{self, Hash};
|
||||
|
@ -562,3 +563,10 @@ impl<T: Clone> Clone for Box<[T]> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<T: ?Sized> borrow::Borrow<T> for Box<T> {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
impl<T: ?Sized> borrow::BorrowMut<T> for Box<T> {
|
||||
fn borrow_mut(&mut self) -> &mut T { &mut **self }
|
||||
}
|
||||
|
|
|
@ -158,6 +158,7 @@ use boxed::Box;
|
|||
#[cfg(test)]
|
||||
use std::boxed::Box;
|
||||
|
||||
use core::borrow;
|
||||
use core::cell::Cell;
|
||||
use core::cmp::Ordering;
|
||||
use core::fmt;
|
||||
|
@ -1091,3 +1092,7 @@ mod tests {
|
|||
assert_eq!(foo, foo.clone());
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ?Sized> borrow::Borrow<T> for Rc<T> {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
|
|
@ -21,119 +21,10 @@ use core::ops::Deref;
|
|||
use core::option::Option;
|
||||
|
||||
use fmt;
|
||||
use alloc::{boxed, rc, arc};
|
||||
|
||||
use self::Cow::*;
|
||||
|
||||
/// A trait for borrowing data.
|
||||
///
|
||||
/// In general, there may be several ways to "borrow" a piece of data. The
|
||||
/// typical ways of borrowing a type `T` are `&T` (a shared borrow) and `&mut T`
|
||||
/// (a mutable borrow). But types like `Vec<T>` provide additional kinds of
|
||||
/// borrows: the borrowed slices `&[T]` and `&mut [T]`.
|
||||
///
|
||||
/// When writing generic code, it is often desirable to abstract over all ways
|
||||
/// of borrowing data from a given type. That is the role of the `Borrow`
|
||||
/// trait: if `T: Borrow<U>`, then `&U` can be borrowed from `&T`. A given
|
||||
/// type can be borrowed as multiple different types. In particular, `Vec<T>:
|
||||
/// Borrow<Vec<T>>` and `Vec<T>: Borrow<[T]>`.
|
||||
///
|
||||
/// If you are implementing `Borrow` and both `Self` and `Borrowed` implement
|
||||
/// `Hash`, `Eq`, and/or `Ord`, they must produce the same result.
|
||||
///
|
||||
/// `Borrow` is very similar to, but different than, `AsRef`. See
|
||||
/// [the book][book] for more.
|
||||
///
|
||||
/// [book]: ../../book/borrow-and-asref.html
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub trait Borrow<Borrowed: ?Sized> {
|
||||
/// Immutably borrows from an owned value.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use std::borrow::Borrow;
|
||||
///
|
||||
/// fn check<T: Borrow<str>>(s: T) {
|
||||
/// assert_eq!("Hello", s.borrow());
|
||||
/// }
|
||||
///
|
||||
/// let s = "Hello".to_string();
|
||||
///
|
||||
/// check(s);
|
||||
///
|
||||
/// let s = "Hello";
|
||||
///
|
||||
/// check(s);
|
||||
/// ```
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
fn borrow(&self) -> &Borrowed;
|
||||
}
|
||||
|
||||
/// A trait for mutably borrowing data.
|
||||
///
|
||||
/// Similar to `Borrow`, but for mutable borrows.
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub trait BorrowMut<Borrowed: ?Sized> : Borrow<Borrowed> {
|
||||
/// Mutably borrows from an owned value.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use std::borrow::BorrowMut;
|
||||
///
|
||||
/// fn check<T: BorrowMut<[i32]>>(mut v: T) {
|
||||
/// assert_eq!(&mut [1, 2, 3], v.borrow_mut());
|
||||
/// }
|
||||
///
|
||||
/// let v = vec![1, 2, 3];
|
||||
///
|
||||
/// check(v);
|
||||
/// ```
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
fn borrow_mut(&mut self) -> &mut Borrowed;
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<T: ?Sized> Borrow<T> for T {
|
||||
fn borrow(&self) -> &T { self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<T: ?Sized> BorrowMut<T> for T {
|
||||
fn borrow_mut(&mut self) -> &mut T { self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, T: ?Sized> Borrow<T> for &'a T {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, T: ?Sized> Borrow<T> for &'a mut T {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, T: ?Sized> BorrowMut<T> for &'a mut T {
|
||||
fn borrow_mut(&mut self) -> &mut T { &mut **self }
|
||||
}
|
||||
|
||||
impl<T: ?Sized> Borrow<T> for boxed::Box<T> {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
impl<T: ?Sized> BorrowMut<T> for boxed::Box<T> {
|
||||
fn borrow_mut(&mut self) -> &mut T { &mut **self }
|
||||
}
|
||||
|
||||
impl<T: ?Sized> Borrow<T> for rc::Rc<T> {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
impl<T: ?Sized> Borrow<T> for arc::Arc<T> {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
pub use core::borrow::{Borrow, BorrowMut};
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, B: ?Sized> Borrow<B> for Cow<'a, B> where B: ToOwned, <B as ToOwned>::Owned: 'a {
|
||||
|
|
|
@ -56,6 +56,7 @@
|
|||
#![feature(unicode)]
|
||||
#![feature(unique)]
|
||||
#![feature(unsafe_no_drop_flag, filling_drop)]
|
||||
#![feature(decode_utf16)]
|
||||
#![feature(utf8_error)]
|
||||
#![cfg_attr(test, feature(rand, test))]
|
||||
|
||||
|
|
|
@ -20,8 +20,8 @@ use core::ops::{self, Deref, Add, Index};
|
|||
use core::ptr;
|
||||
use core::slice;
|
||||
use core::str::pattern::Pattern;
|
||||
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
|
||||
use rustc_unicode::str as unicode_str;
|
||||
use rustc_unicode::str::Utf16Item;
|
||||
|
||||
use borrow::{Cow, IntoCow};
|
||||
use range::RangeArgument;
|
||||
|
@ -267,14 +267,7 @@ impl String {
|
|||
/// ```
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub fn from_utf16(v: &[u16]) -> Result<String, FromUtf16Error> {
|
||||
let mut s = String::with_capacity(v.len());
|
||||
for c in unicode_str::utf16_items(v) {
|
||||
match c {
|
||||
Utf16Item::ScalarValue(c) => s.push(c),
|
||||
Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())),
|
||||
}
|
||||
}
|
||||
Ok(s)
|
||||
decode_utf16(v.iter().cloned()).collect::<Result<_, _>>().map_err(|_| FromUtf16Error(()))
|
||||
}
|
||||
|
||||
/// Decode a UTF-16 encoded vector `v` into a string, replacing
|
||||
|
@ -294,7 +287,7 @@ impl String {
|
|||
#[inline]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub fn from_utf16_lossy(v: &[u16]) -> String {
|
||||
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
|
||||
decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect()
|
||||
}
|
||||
|
||||
/// Creates a new `String` from a length, capacity, and pointer.
|
||||
|
|
109
src/libcore/borrow.rs
Normal file
109
src/libcore/borrow.rs
Normal file
|
@ -0,0 +1,109 @@
|
|||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! A module for working with borrowed data.
|
||||
|
||||
#![stable(feature = "rust1", since = "1.0.0")]
|
||||
|
||||
use marker::Sized;
|
||||
|
||||
/// A trait for borrowing data.
|
||||
///
|
||||
/// In general, there may be several ways to "borrow" a piece of data. The
|
||||
/// typical ways of borrowing a type `T` are `&T` (a shared borrow) and `&mut T`
|
||||
/// (a mutable borrow). But types like `Vec<T>` provide additional kinds of
|
||||
/// borrows: the borrowed slices `&[T]` and `&mut [T]`.
|
||||
///
|
||||
/// When writing generic code, it is often desirable to abstract over all ways
|
||||
/// of borrowing data from a given type. That is the role of the `Borrow`
|
||||
/// trait: if `T: Borrow<U>`, then `&U` can be borrowed from `&T`. A given
|
||||
/// type can be borrowed as multiple different types. In particular, `Vec<T>:
|
||||
/// Borrow<Vec<T>>` and `Vec<T>: Borrow<[T]>`.
|
||||
///
|
||||
/// If you are implementing `Borrow` and both `Self` and `Borrowed` implement
|
||||
/// `Hash`, `Eq`, and/or `Ord`, they must produce the same result.
|
||||
///
|
||||
/// `Borrow` is very similar to, but different than, `AsRef`. See
|
||||
/// [the book][book] for more.
|
||||
///
|
||||
/// [book]: ../../book/borrow-and-asref.html
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub trait Borrow<Borrowed: ?Sized> {
|
||||
/// Immutably borrows from an owned value.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use std::borrow::Borrow;
|
||||
///
|
||||
/// fn check<T: Borrow<str>>(s: T) {
|
||||
/// assert_eq!("Hello", s.borrow());
|
||||
/// }
|
||||
///
|
||||
/// let s = "Hello".to_string();
|
||||
///
|
||||
/// check(s);
|
||||
///
|
||||
/// let s = "Hello";
|
||||
///
|
||||
/// check(s);
|
||||
/// ```
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
fn borrow(&self) -> &Borrowed;
|
||||
}
|
||||
|
||||
/// A trait for mutably borrowing data.
|
||||
///
|
||||
/// Similar to `Borrow`, but for mutable borrows.
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
pub trait BorrowMut<Borrowed: ?Sized> : Borrow<Borrowed> {
|
||||
/// Mutably borrows from an owned value.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use std::borrow::BorrowMut;
|
||||
///
|
||||
/// fn check<T: BorrowMut<[i32]>>(mut v: T) {
|
||||
/// assert_eq!(&mut [1, 2, 3], v.borrow_mut());
|
||||
/// }
|
||||
///
|
||||
/// let v = vec![1, 2, 3];
|
||||
///
|
||||
/// check(v);
|
||||
/// ```
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
fn borrow_mut(&mut self) -> &mut Borrowed;
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<T: ?Sized> Borrow<T> for T {
|
||||
fn borrow(&self) -> &T { self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<T: ?Sized> BorrowMut<T> for T {
|
||||
fn borrow_mut(&mut self) -> &mut T { self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, T: ?Sized> Borrow<T> for &'a T {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, T: ?Sized> Borrow<T> for &'a mut T {
|
||||
fn borrow(&self) -> &T { &**self }
|
||||
}
|
||||
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
impl<'a, T: ?Sized> BorrowMut<T> for &'a mut T {
|
||||
fn borrow_mut(&mut self) -> &mut T { &mut **self }
|
||||
}
|
|
@ -139,6 +139,7 @@ pub mod cmp;
|
|||
pub mod clone;
|
||||
pub mod default;
|
||||
pub mod convert;
|
||||
pub mod borrow;
|
||||
|
||||
/* Core types and methods on primitives */
|
||||
|
||||
|
|
|
@ -207,3 +207,12 @@ fn test_len_utf16() {
|
|||
assert!('\u{a66e}'.len_utf16() == 1);
|
||||
assert!('\u{1f4a9}'.len_utf16() == 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_utf16() {
|
||||
fn check(s: &[u16], expected: &[Result<char, u16>]) {
|
||||
assert_eq!(::std::char::decode_utf16(s.iter().cloned()).collect::<Vec<_>>(), expected);
|
||||
}
|
||||
check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);
|
||||
check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#![feature(float_from_str_radix)]
|
||||
#![feature(flt2dec)]
|
||||
#![feature(dec2flt)]
|
||||
#![feature(decode_utf16)]
|
||||
#![feature(fmt_radix)]
|
||||
#![feature(iter_arith)]
|
||||
#![feature(iter_arith)]
|
||||
|
|
|
@ -503,3 +503,116 @@ impl char {
|
|||
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that decodes UTF-16 encoded codepoints from an iterator of `u16`s.
|
||||
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
|
||||
#[derive(Clone)]
|
||||
pub struct DecodeUtf16<I> where I: Iterator<Item=u16> {
|
||||
iter: I,
|
||||
buf: Option<u16>,
|
||||
}
|
||||
|
||||
/// Create an iterator over the UTF-16 encoded codepoints in `iterable`,
|
||||
/// returning unpaired surrogates as `Err`s.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(decode_utf16)]
|
||||
///
|
||||
/// use std::char::decode_utf16;
|
||||
///
|
||||
/// fn main() {
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(decode_utf16(v.iter().cloned()).collect::<Vec<_>>(),
|
||||
/// vec![Ok('𝄞'),
|
||||
/// Ok('m'), Ok('u'), Ok('s'),
|
||||
/// Err(0xDD1E),
|
||||
/// Ok('i'), Ok('c'),
|
||||
/// Err(0xD834)]);
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(decode_utf16)]
|
||||
///
|
||||
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
|
||||
///
|
||||
/// fn main() {
|
||||
/// // 𝄞mus<invalid>ic<invalid>
|
||||
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
|
||||
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
|
||||
/// 0xD834];
|
||||
///
|
||||
/// assert_eq!(decode_utf16(v.iter().cloned())
|
||||
/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
|
||||
/// .collect::<String>(),
|
||||
/// "𝄞mus\u{FFFD}ic\u{FFFD}");
|
||||
/// }
|
||||
/// ```
|
||||
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
|
||||
#[inline]
|
||||
pub fn decode_utf16<I: IntoIterator<Item=u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
|
||||
DecodeUtf16 {
|
||||
iter: iterable.into_iter(),
|
||||
buf: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
|
||||
impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
|
||||
type Item = Result<char, u16>;
|
||||
|
||||
fn next(&mut self) -> Option<Result<char, u16>> {
|
||||
let u = match self.buf.take() {
|
||||
Some(buf) => buf,
|
||||
None => match self.iter.next() {
|
||||
Some(u) => u,
|
||||
None => return None
|
||||
}
|
||||
};
|
||||
|
||||
if u < 0xD800 || 0xDFFF < u {
|
||||
// not a surrogate
|
||||
Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
|
||||
} else if u >= 0xDC00 {
|
||||
// a trailing surrogate
|
||||
Some(Err(u))
|
||||
} else {
|
||||
let u2 = match self.iter.next() {
|
||||
Some(u2) => u2,
|
||||
// eof
|
||||
None => return Some(Err(u))
|
||||
};
|
||||
if u2 < 0xDC00 || u2 > 0xDFFF {
|
||||
// not a trailing surrogate so we're not a valid
|
||||
// surrogate pair, so rewind to redecode u2 next time.
|
||||
self.buf = Some(u2);
|
||||
return Some(Err(u))
|
||||
}
|
||||
|
||||
// all ok, so lets decode it.
|
||||
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
|
||||
Some(Ok(unsafe { from_u32_unchecked(c) }))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (low, high) = self.iter.size_hint();
|
||||
// we could be entirely valid surrogates (2 elements per
|
||||
// char), or entirely non-surrogates (1 element per char)
|
||||
(low / 2, high)
|
||||
}
|
||||
}
|
||||
|
||||
/// U+FFFD REPLACEMENT CHARACTER (<28>) is used in Unicode to represent a decoding error.
|
||||
/// It can occur, for example, when giving ill-formed UTF-8 bytes to `String::from_utf8_lossy`.
|
||||
#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")]
|
||||
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';
|
||||
|
|
|
@ -46,6 +46,7 @@ mod tables;
|
|||
mod u_str;
|
||||
pub mod char;
|
||||
|
||||
#[allow(deprecated)]
|
||||
pub mod str {
|
||||
pub use u_str::{UnicodeStr, SplitWhitespace};
|
||||
pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};
|
||||
|
|
|
@ -13,8 +13,9 @@
|
|||
//! This module provides functionality to `str` that requires the Unicode methods provided by the
|
||||
//! unicode parts of the CharExt trait.
|
||||
|
||||
use char::{DecodeUtf16, decode_utf16};
|
||||
use core::char;
|
||||
use core::iter::Filter;
|
||||
use core::iter::{Cloned, Filter};
|
||||
use core::slice;
|
||||
use core::str::Split;
|
||||
|
||||
|
@ -119,11 +120,18 @@ pub fn is_utf16(v: &[u16]) -> bool {
|
|||
|
||||
/// An iterator that decodes UTF-16 encoded codepoints from a vector
|
||||
/// of `u16`s.
|
||||
#[deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
#[derive(Clone)]
|
||||
pub struct Utf16Items<'a> {
|
||||
iter: slice::Iter<'a, u16>
|
||||
decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>
|
||||
}
|
||||
|
||||
/// The possibilities for values decoded from a `u16` stream.
|
||||
#[deprecated(since = "1.4.0", reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
|
||||
pub enum Utf16Item {
|
||||
/// A valid codepoint.
|
||||
|
@ -132,6 +140,7 @@ pub enum Utf16Item {
|
|||
LoneSurrogate(u16)
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl Utf16Item {
|
||||
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
|
||||
/// replacement character (U+FFFD).
|
||||
|
@ -144,49 +153,22 @@ impl Utf16Item {
|
|||
}
|
||||
}
|
||||
|
||||
#[deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
impl<'a> Iterator for Utf16Items<'a> {
|
||||
type Item = Utf16Item;
|
||||
|
||||
fn next(&mut self) -> Option<Utf16Item> {
|
||||
let u = match self.iter.next() {
|
||||
Some(u) => *u,
|
||||
None => return None
|
||||
};
|
||||
|
||||
if u < 0xD800 || 0xDFFF < u {
|
||||
// not a surrogate
|
||||
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(u as u32) }))
|
||||
} else if u >= 0xDC00 {
|
||||
// a trailing surrogate
|
||||
Some(Utf16Item::LoneSurrogate(u))
|
||||
} else {
|
||||
// preserve state for rewinding.
|
||||
let old = self.iter.clone();
|
||||
|
||||
let u2 = match self.iter.next() {
|
||||
Some(u2) => *u2,
|
||||
// eof
|
||||
None => return Some(Utf16Item::LoneSurrogate(u))
|
||||
};
|
||||
if u2 < 0xDC00 || u2 > 0xDFFF {
|
||||
// not a trailing surrogate so we're not a valid
|
||||
// surrogate pair, so rewind to redecode u2 next time.
|
||||
self.iter = old.clone();
|
||||
return Some(Utf16Item::LoneSurrogate(u))
|
||||
}
|
||||
|
||||
// all ok, so lets decode it.
|
||||
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
|
||||
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(c) }))
|
||||
}
|
||||
self.decoder.next().map(|result| match result {
|
||||
Ok(c) => Utf16Item::ScalarValue(c),
|
||||
Err(s) => Utf16Item::LoneSurrogate(s),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (low, high) = self.iter.size_hint();
|
||||
// we could be entirely valid surrogates (2 elements per
|
||||
// char), or entirely non-surrogates (1 element per char)
|
||||
(low / 2, high)
|
||||
self.decoder.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -196,7 +178,7 @@ impl<'a> Iterator for Utf16Items<'a> {
|
|||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(unicode)]
|
||||
/// #![feature(unicode, decode_utf16)]
|
||||
///
|
||||
/// extern crate rustc_unicode;
|
||||
///
|
||||
|
@ -216,8 +198,11 @@ impl<'a> Iterator for Utf16Items<'a> {
|
|||
/// LoneSurrogate(0xD834)]);
|
||||
/// }
|
||||
/// ```
|
||||
#[deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
|
||||
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
|
||||
#[allow(deprecated)]
|
||||
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
|
||||
Utf16Items { iter : v.iter() }
|
||||
Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
|
||||
}
|
||||
|
||||
/// Iterator adaptor for encoding `char`s to UTF-16.
|
||||
|
|
|
@ -209,8 +209,6 @@ use std::str::FromStr;
|
|||
use std::string;
|
||||
use std::{char, f64, fmt, str};
|
||||
use std;
|
||||
use rustc_unicode::str as unicode_str;
|
||||
use rustc_unicode::str::Utf16Item;
|
||||
|
||||
use Encodable;
|
||||
|
||||
|
@ -1712,11 +1710,13 @@ impl<T: Iterator<Item=char>> Parser<T> {
|
|||
_ => return self.error(UnexpectedEndOfHexEscape),
|
||||
}
|
||||
|
||||
let buf = [n1, try!(self.decode_hex_escape())];
|
||||
match unicode_str::utf16_items(&buf).next() {
|
||||
Some(Utf16Item::ScalarValue(c)) => res.push(c),
|
||||
_ => return self.error(LoneLeadingSurrogateInHexEscape),
|
||||
let n2 = try!(self.decode_hex_escape());
|
||||
if n2 < 0xDC00 || n2 > 0xDFFF {
|
||||
return self.error(LoneLeadingSurrogateInHexEscape)
|
||||
}
|
||||
let c = (((n1 - 0xD800) as u32) << 10 |
|
||||
(n2 - 0xDC00) as u32) + 0x1_0000;
|
||||
res.push(char::from_u32(c).unwrap());
|
||||
}
|
||||
|
||||
n => match char::from_u32(n as u32) {
|
||||
|
|
|
@ -242,6 +242,7 @@
|
|||
#![feature(unicode)]
|
||||
#![feature(unique)]
|
||||
#![feature(unsafe_no_drop_flag, filling_drop)]
|
||||
#![feature(decode_utf16)]
|
||||
#![feature(vec_push_all)]
|
||||
#![feature(vec_resize)]
|
||||
#![feature(wrapping)]
|
||||
|
|
|
@ -37,7 +37,6 @@ use hash::{Hash, Hasher};
|
|||
use iter::FromIterator;
|
||||
use mem;
|
||||
use ops;
|
||||
use rustc_unicode::str::{Utf16Item, utf16_items};
|
||||
use slice;
|
||||
use str;
|
||||
use string::String;
|
||||
|
@ -186,14 +185,14 @@ impl Wtf8Buf {
|
|||
/// will always return the original code units.
|
||||
pub fn from_wide(v: &[u16]) -> Wtf8Buf {
|
||||
let mut string = Wtf8Buf::with_capacity(v.len());
|
||||
for item in utf16_items(v) {
|
||||
for item in char::decode_utf16(v.iter().cloned()) {
|
||||
match item {
|
||||
Utf16Item::ScalarValue(c) => string.push_char(c),
|
||||
Utf16Item::LoneSurrogate(s) => {
|
||||
Ok(ch) => string.push_char(ch),
|
||||
Err(surrogate) => {
|
||||
// Surrogates are known to be in the code point range.
|
||||
let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
|
||||
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
|
||||
// Skip the WTF-8 concatenation check,
|
||||
// surrogate pairs are already decoded by utf16_items
|
||||
// surrogate pairs are already decoded by decode_utf16
|
||||
string.push_code_point_unchecked(code_point)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue