Expose Utf8Lossy
as Utf8Chunks
This commit is contained in:
parent
be9cfb307e
commit
e8ee0b7b2b
10 changed files with 273 additions and 184 deletions
|
@ -141,6 +141,7 @@
|
||||||
#![feature(unchecked_math)]
|
#![feature(unchecked_math)]
|
||||||
#![feature(unicode_internals)]
|
#![feature(unicode_internals)]
|
||||||
#![feature(unsize)]
|
#![feature(unsize)]
|
||||||
|
#![feature(utf8_chunks)]
|
||||||
#![feature(std_internals)]
|
#![feature(std_internals)]
|
||||||
//
|
//
|
||||||
// Language features:
|
// Language features:
|
||||||
|
|
|
@ -71,6 +71,8 @@ pub use core::str::{RSplit, Split};
|
||||||
pub use core::str::{RSplitN, SplitN};
|
pub use core::str::{RSplitN, SplitN};
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub use core::str::{RSplitTerminator, SplitTerminator};
|
pub use core::str::{RSplitTerminator, SplitTerminator};
|
||||||
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
|
pub use core::str::{Utf8Chunk, Utf8Chunks};
|
||||||
|
|
||||||
/// Note: `str` in `Concat<str>` is not meaningful here.
|
/// Note: `str` in `Concat<str>` is not meaningful here.
|
||||||
/// This type parameter of the trait only exists to enable another impl.
|
/// This type parameter of the trait only exists to enable another impl.
|
||||||
|
|
|
@ -58,9 +58,9 @@ use core::ops::Bound::{Excluded, Included, Unbounded};
|
||||||
use core::ops::{self, Index, IndexMut, Range, RangeBounds};
|
use core::ops::{self, Index, IndexMut, Range, RangeBounds};
|
||||||
use core::ptr;
|
use core::ptr;
|
||||||
use core::slice;
|
use core::slice;
|
||||||
#[cfg(not(no_global_oom_handling))]
|
|
||||||
use core::str::lossy;
|
|
||||||
use core::str::pattern::Pattern;
|
use core::str::pattern::Pattern;
|
||||||
|
#[cfg(not(no_global_oom_handling))]
|
||||||
|
use core::str::Utf8Chunks;
|
||||||
|
|
||||||
#[cfg(not(no_global_oom_handling))]
|
#[cfg(not(no_global_oom_handling))]
|
||||||
use crate::borrow::{Cow, ToOwned};
|
use crate::borrow::{Cow, ToOwned};
|
||||||
|
@ -628,11 +628,11 @@ impl String {
|
||||||
#[cfg(not(no_global_oom_handling))]
|
#[cfg(not(no_global_oom_handling))]
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
|
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
|
||||||
let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks();
|
let mut iter = Utf8Chunks::new(v);
|
||||||
|
|
||||||
let first_valid = if let Some(chunk) = iter.next() {
|
let first_valid = if let Some(chunk) = iter.next() {
|
||||||
let lossy::Utf8LossyChunk { valid, broken } = chunk;
|
let valid = chunk.valid();
|
||||||
if broken.is_empty() {
|
if chunk.invalid().is_empty() {
|
||||||
debug_assert_eq!(valid.len(), v.len());
|
debug_assert_eq!(valid.len(), v.len());
|
||||||
return Cow::Borrowed(valid);
|
return Cow::Borrowed(valid);
|
||||||
}
|
}
|
||||||
|
@ -647,9 +647,9 @@ impl String {
|
||||||
res.push_str(first_valid);
|
res.push_str(first_valid);
|
||||||
res.push_str(REPLACEMENT);
|
res.push_str(REPLACEMENT);
|
||||||
|
|
||||||
for lossy::Utf8LossyChunk { valid, broken } in iter {
|
for chunk in iter {
|
||||||
res.push_str(valid);
|
res.push_str(chunk.valid());
|
||||||
if !broken.is_empty() {
|
if !chunk.invalid().is_empty() {
|
||||||
res.push_str(REPLACEMENT);
|
res.push_str(REPLACEMENT);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,51 +1,170 @@
|
||||||
use crate::char;
|
use crate::fmt;
|
||||||
use crate::fmt::{self, Write};
|
use crate::fmt::Formatter;
|
||||||
use crate::mem;
|
use crate::fmt::Write;
|
||||||
|
use crate::iter::FusedIterator;
|
||||||
|
|
||||||
use super::from_utf8_unchecked;
|
use super::from_utf8_unchecked;
|
||||||
use super::validations::utf8_char_width;
|
use super::validations::utf8_char_width;
|
||||||
|
|
||||||
/// Lossy UTF-8 string.
|
/// An item returned by the [`Utf8Chunks`] iterator.
|
||||||
#[unstable(feature = "str_internals", issue = "none")]
|
///
|
||||||
pub struct Utf8Lossy {
|
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
|
||||||
bytes: [u8],
|
/// when decoding a UTF-8 string.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// #![feature(utf8_chunks)]
|
||||||
|
///
|
||||||
|
/// use std::str::Utf8Chunks;
|
||||||
|
///
|
||||||
|
/// // An invalid UTF-8 string
|
||||||
|
/// let bytes = b"foo\xF1\x80bar";
|
||||||
|
///
|
||||||
|
/// // Decode the first `Utf8Chunk`
|
||||||
|
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
|
||||||
|
///
|
||||||
|
/// // The first three characters are valid UTF-8
|
||||||
|
/// assert_eq!("foo", chunk.valid());
|
||||||
|
///
|
||||||
|
/// // The fourth character is broken
|
||||||
|
/// assert_eq!(b"\xF1\x80", chunk.invalid());
|
||||||
|
/// ```
|
||||||
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
|
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||||
|
pub struct Utf8Chunk<'a> {
|
||||||
|
valid: &'a str,
|
||||||
|
invalid: &'a [u8],
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Utf8Lossy {
|
impl<'a> Utf8Chunk<'a> {
|
||||||
|
/// Returns the next validated UTF-8 substring.
|
||||||
|
///
|
||||||
|
/// This substring can be empty at the start of the string or between
|
||||||
|
/// broken UTF-8 characters.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
// SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
|
pub fn valid(&self) -> &'a str {
|
||||||
unsafe { mem::transmute(bytes) }
|
self.valid
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
|
/// Returns the invalid sequence that caused a failure.
|
||||||
Utf8LossyChunksIter { source: &self.bytes }
|
///
|
||||||
|
/// The returned slice will have a maximum length of 3 and starts after the
|
||||||
|
/// substring given by [`valid`]. Decoding will resume after this sequence.
|
||||||
|
///
|
||||||
|
/// If empty, this is the last chunk in the string. If non-empty, an
|
||||||
|
/// unexpected byte was encountered or the end of the input was reached
|
||||||
|
/// unexpectedly.
|
||||||
|
///
|
||||||
|
/// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
|
||||||
|
/// CHARACTER`].
|
||||||
|
///
|
||||||
|
/// [`valid`]: Self::valid
|
||||||
|
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
|
||||||
|
#[must_use]
|
||||||
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
|
pub fn invalid(&self) -> &'a [u8] {
|
||||||
|
self.invalid
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterator over lossy UTF-8 string
|
#[must_use]
|
||||||
#[must_use = "iterators are lazy and do nothing unless consumed"]
|
|
||||||
#[unstable(feature = "str_internals", issue = "none")]
|
#[unstable(feature = "str_internals", issue = "none")]
|
||||||
#[allow(missing_debug_implementations)]
|
pub struct Debug<'a>(&'a [u8]);
|
||||||
pub struct Utf8LossyChunksIter<'a> {
|
|
||||||
|
#[unstable(feature = "str_internals", issue = "none")]
|
||||||
|
impl fmt::Debug for Debug<'_> {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||||
|
f.write_char('"')?;
|
||||||
|
|
||||||
|
for chunk in Utf8Chunks::new(self.0) {
|
||||||
|
// Valid part.
|
||||||
|
// Here we partially parse UTF-8 again which is suboptimal.
|
||||||
|
{
|
||||||
|
let valid = chunk.valid();
|
||||||
|
let mut from = 0;
|
||||||
|
for (i, c) in valid.char_indices() {
|
||||||
|
let esc = c.escape_debug();
|
||||||
|
// If char needs escaping, flush backlog so far and write, else skip
|
||||||
|
if esc.len() != 1 {
|
||||||
|
f.write_str(&valid[from..i])?;
|
||||||
|
for c in esc {
|
||||||
|
f.write_char(c)?;
|
||||||
|
}
|
||||||
|
from = i + c.len_utf8();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
f.write_str(&valid[from..])?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Broken parts of string as hex escape.
|
||||||
|
for &b in chunk.invalid() {
|
||||||
|
write!(f, "\\x{:02X}", b)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
f.write_char('"')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
|
||||||
|
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
|
||||||
|
///
|
||||||
|
/// If you want a simple conversion from UTF-8 byte slices to string slices,
|
||||||
|
/// [`from_utf8`] is easier to use.
|
||||||
|
///
|
||||||
|
/// [byteslice]: slice
|
||||||
|
/// [`from_utf8`]: super::from_utf8
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// This can be used to create functionality similar to
|
||||||
|
/// [`String::from_utf8_lossy`] without allocating heap memory:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// #![feature(utf8_chunks)]
|
||||||
|
///
|
||||||
|
/// use std::str::Utf8Chunks;
|
||||||
|
///
|
||||||
|
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
|
||||||
|
/// for chunk in Utf8Chunks::new(input) {
|
||||||
|
/// push(chunk.valid());
|
||||||
|
///
|
||||||
|
/// if !chunk.invalid().is_empty() {
|
||||||
|
/// push("\u{FFFD}");
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
|
||||||
|
#[must_use = "iterators are lazy and do nothing unless consumed"]
|
||||||
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct Utf8Chunks<'a> {
|
||||||
source: &'a [u8],
|
source: &'a [u8],
|
||||||
}
|
}
|
||||||
|
|
||||||
#[unstable(feature = "str_internals", issue = "none")]
|
impl<'a> Utf8Chunks<'a> {
|
||||||
#[derive(PartialEq, Eq, Debug)]
|
/// Creates a new iterator to decode the bytes.
|
||||||
pub struct Utf8LossyChunk<'a> {
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
/// Sequence of valid chars.
|
pub fn new(bytes: &'a [u8]) -> Self {
|
||||||
/// Can be empty between broken UTF-8 chars.
|
Self { source: bytes }
|
||||||
pub valid: &'a str,
|
}
|
||||||
/// Single broken char, empty if none.
|
|
||||||
/// Empty iff iterator item is last.
|
#[doc(hidden)]
|
||||||
pub broken: &'a [u8],
|
#[unstable(feature = "str_internals", issue = "none")]
|
||||||
|
pub fn debug(&self) -> Debug<'_> {
|
||||||
|
Debug(self.source)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator for Utf8LossyChunksIter<'a> {
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
type Item = Utf8LossyChunk<'a>;
|
impl<'a> Iterator for Utf8Chunks<'a> {
|
||||||
|
type Item = Utf8Chunk<'a>;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
|
fn next(&mut self) -> Option<Utf8Chunk<'a>> {
|
||||||
if self.source.is_empty() {
|
if self.source.is_empty() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
|
||||||
|
|
||||||
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
|
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
|
||||||
// `valid_up_to = i` and `i` only increases.
|
// `valid_up_to = i` and `i` only increases.
|
||||||
let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
|
let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
|
||||||
|
|
||||||
Some(Utf8LossyChunk {
|
Some(Utf8Chunk {
|
||||||
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
|
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
|
||||||
valid: unsafe { from_utf8_unchecked(valid) },
|
valid: unsafe { from_utf8_unchecked(valid) },
|
||||||
broken,
|
invalid,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Display for Utf8Lossy {
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
impl FusedIterator for Utf8Chunks<'_> {}
|
||||||
// If we're the empty string then our iterator won't actually yield
|
|
||||||
// anything, so perform the formatting manually
|
|
||||||
if self.bytes.is_empty() {
|
|
||||||
return "".fmt(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
for Utf8LossyChunk { valid, broken } in self.chunks() {
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
// If we successfully decoded the whole chunk as a valid string then
|
impl fmt::Debug for Utf8Chunks<'_> {
|
||||||
// we can return a direct formatting of the string which will also
|
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||||
// respect various formatting flags if possible.
|
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
|
||||||
if valid.len() == self.bytes.len() {
|
|
||||||
assert!(broken.is_empty());
|
|
||||||
return valid.fmt(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
f.write_str(valid)?;
|
|
||||||
if !broken.is_empty() {
|
|
||||||
f.write_char(char::REPLACEMENT_CHARACTER)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Debug for Utf8Lossy {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
f.write_char('"')?;
|
|
||||||
|
|
||||||
for Utf8LossyChunk { valid, broken } in self.chunks() {
|
|
||||||
// Valid part.
|
|
||||||
// Here we partially parse UTF-8 again which is suboptimal.
|
|
||||||
{
|
|
||||||
let mut from = 0;
|
|
||||||
for (i, c) in valid.char_indices() {
|
|
||||||
let esc = c.escape_debug();
|
|
||||||
// If char needs escaping, flush backlog so far and write, else skip
|
|
||||||
if esc.len() != 1 {
|
|
||||||
f.write_str(&valid[from..i])?;
|
|
||||||
for c in esc {
|
|
||||||
f.write_char(c)?;
|
|
||||||
}
|
|
||||||
from = i + c.len_utf8();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
f.write_str(&valid[from..])?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Broken parts of string as hex escape.
|
|
||||||
for &b in broken {
|
|
||||||
write!(f, "\\x{:02x}", b)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
f.write_char('"')
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,9 +22,9 @@ use crate::slice::{self, SliceIndex};
|
||||||
|
|
||||||
pub mod pattern;
|
pub mod pattern;
|
||||||
|
|
||||||
#[unstable(feature = "str_internals", issue = "none")]
|
mod lossy;
|
||||||
#[allow(missing_docs)]
|
#[unstable(feature = "utf8_chunks", issue = "99543")]
|
||||||
pub mod lossy;
|
pub use lossy::{Utf8Chunk, Utf8Chunks};
|
||||||
|
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub use converts::{from_utf8, from_utf8_unchecked};
|
pub use converts::{from_utf8, from_utf8_unchecked};
|
||||||
|
|
|
@ -96,6 +96,7 @@
|
||||||
#![feature(waker_getters)]
|
#![feature(waker_getters)]
|
||||||
#![feature(slice_flatten)]
|
#![feature(slice_flatten)]
|
||||||
#![feature(provide_any)]
|
#![feature(provide_any)]
|
||||||
|
#![feature(utf8_chunks)]
|
||||||
#![deny(unsafe_op_in_unsafe_fn)]
|
#![deny(unsafe_op_in_unsafe_fn)]
|
||||||
|
|
||||||
extern crate test;
|
extern crate test;
|
||||||
|
|
|
@ -1,85 +1,85 @@
|
||||||
use core::str::lossy::*;
|
use core::str::Utf8Chunks;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn chunks() {
|
fn chunks() {
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"hello").chunks();
|
macro_rules! assert_chunks {
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "hello", broken: b"" }), iter.next());
|
( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{
|
||||||
|
let mut iter = Utf8Chunks::new($string);
|
||||||
|
$(
|
||||||
|
let chunk = iter.next().expect("missing chunk");
|
||||||
|
assert_eq!($valid, chunk.valid());
|
||||||
|
assert_eq!($invalid, chunk.invalid());
|
||||||
|
)*
|
||||||
assert_eq!(None, iter.next());
|
assert_eq!(None, iter.next());
|
||||||
|
}};
|
||||||
|
}
|
||||||
|
|
||||||
let mut iter = Utf8Lossy::from_bytes("ศไทย中华Việt Nam".as_bytes()).chunks();
|
assert_chunks!(b"hello", ("hello", b""));
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "ศไทย中华Việt Nam", broken: b"" }), iter.next());
|
assert_chunks!("ศไทย中华Việt Nam".as_bytes(), ("ศไทย中华Việt Nam", b""));
|
||||||
assert_eq!(None, iter.next());
|
assert_chunks!(
|
||||||
|
b"Hello\xC2 There\xFF Goodbye",
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"Hello\xC2 There\xFF Goodbye").chunks();
|
("Hello", b"\xC2"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC2" }), iter.next());
|
(" There", b"\xFF"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xFF" }), iter.next());
|
(" Goodbye", b""),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"" }), iter.next());
|
);
|
||||||
assert_eq!(None, iter.next());
|
assert_chunks!(
|
||||||
|
b"Hello\xC0\x80 There\xE6\x83 Goodbye",
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").chunks();
|
("Hello", b"\xC0"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC0" }), iter.next());
|
("", b"\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
|
(" There", b"\xE6\x83"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xE6\x83" }), iter.next());
|
(" Goodbye", b""),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"" }), iter.next());
|
);
|
||||||
assert_eq!(None, iter.next());
|
assert_chunks!(
|
||||||
|
b"\xF5foo\xF5\x80bar",
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"\xF5foo\xF5\x80bar").chunks();
|
("", b"\xF5"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF5" }), iter.next());
|
("foo", b"\xF5"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF5" }), iter.next());
|
("", b"\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
|
("bar", b""),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"" }), iter.next());
|
);
|
||||||
assert_eq!(None, iter.next());
|
assert_chunks!(
|
||||||
|
b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"\xF1foo\xF1\x80bar\xF1\x80\x80baz").chunks();
|
("", b"\xF1"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF1" }), iter.next());
|
("foo", b"\xF1\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF1\x80" }), iter.next());
|
("bar", b"\xF1\x80\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF1\x80\x80" }), iter.next());
|
("baz", b""),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"" }), iter.next());
|
);
|
||||||
assert_eq!(None, iter.next());
|
assert_chunks!(
|
||||||
|
b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"\xF4foo\xF4\x80bar\xF4\xBFbaz").chunks();
|
("", b"\xF4"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF4" }), iter.next());
|
("foo", b"\xF4\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF4\x80" }), iter.next());
|
("bar", b"\xF4"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF4" }), iter.next());
|
("", b"\xBF"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next());
|
("baz", b""),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"" }), iter.next());
|
);
|
||||||
assert_eq!(None, iter.next());
|
assert_chunks!(
|
||||||
|
b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar").chunks();
|
("", b"\xF0"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF0" }), iter.next());
|
("", b"\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
|
("", b"\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
|
("", b"\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
|
("foo\u{10000}bar", b""),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "foo\u{10000}bar", broken: b"" }), iter.next());
|
);
|
||||||
assert_eq!(None, iter.next());
|
|
||||||
|
|
||||||
// surrogates
|
// surrogates
|
||||||
let mut iter = Utf8Lossy::from_bytes(b"\xED\xA0\x80foo\xED\xBF\xBFbar").chunks();
|
assert_chunks!(
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xED" }), iter.next());
|
b"\xED\xA0\x80foo\xED\xBF\xBFbar",
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xA0" }), iter.next());
|
("", b"\xED"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80" }), iter.next());
|
("", b"\xA0"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xED" }), iter.next());
|
("", b"\x80"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next());
|
("foo", b"\xED"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF" }), iter.next());
|
("", b"\xBF"),
|
||||||
assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"" }), iter.next());
|
("", b"\xBF"),
|
||||||
assert_eq!(None, iter.next());
|
("bar", b""),
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn display() {
|
|
||||||
assert_eq!(
|
|
||||||
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
|
|
||||||
&Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn debug() {
|
fn debug() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
"\"Hello\\xc0\\x80 There\\xe6\\x83 Goodbye\\u{10d4ea}\"",
|
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
|
||||||
&format!(
|
&format!(
|
||||||
"{:?}",
|
"{:?}",
|
||||||
Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa")
|
Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(),
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -258,6 +258,7 @@
|
||||||
#![feature(staged_api)]
|
#![feature(staged_api)]
|
||||||
#![feature(thread_local)]
|
#![feature(thread_local)]
|
||||||
#![feature(try_blocks)]
|
#![feature(try_blocks)]
|
||||||
|
#![feature(utf8_chunks)]
|
||||||
//
|
//
|
||||||
// Library features (core):
|
// Library features (core):
|
||||||
#![feature(array_error_internals)]
|
#![feature(array_error_internals)]
|
||||||
|
|
|
@ -11,7 +11,7 @@ use crate::str;
|
||||||
use crate::sync::Arc;
|
use crate::sync::Arc;
|
||||||
use crate::sys_common::{AsInner, IntoInner};
|
use crate::sys_common::{AsInner, IntoInner};
|
||||||
|
|
||||||
use core::str::lossy::{Utf8Lossy, Utf8LossyChunk};
|
use core::str::Utf8Chunks;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[path = "../unix/os_str/tests.rs"]
|
#[path = "../unix/os_str/tests.rs"]
|
||||||
|
@ -29,26 +29,32 @@ pub struct Slice {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for Slice {
|
impl fmt::Debug for Slice {
|
||||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
// Writes out a valid unicode string with the correct escape sequences
|
fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f)
|
||||||
|
|
||||||
formatter.write_str("\"")?;
|
|
||||||
for Utf8LossyChunk { valid, broken } in Utf8Lossy::from_bytes(&self.inner).chunks() {
|
|
||||||
for c in valid.chars().flat_map(|c| c.escape_debug()) {
|
|
||||||
formatter.write_char(c)?
|
|
||||||
}
|
|
||||||
|
|
||||||
for b in broken {
|
|
||||||
write!(formatter, "\\x{:02X}", b)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
formatter.write_str("\"")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Display for Slice {
|
impl fmt::Display for Slice {
|
||||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
fmt::Display::fmt(&Utf8Lossy::from_bytes(&self.inner), formatter)
|
// If we're the empty string then our iterator won't actually yield
|
||||||
|
// anything, so perform the formatting manually
|
||||||
|
if self.inner.is_empty() {
|
||||||
|
return "".fmt(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
for chunk in Utf8Chunks::new(&self.inner) {
|
||||||
|
let valid = chunk.valid();
|
||||||
|
// If we successfully decoded the whole chunk as a valid string then
|
||||||
|
// we can return a direct formatting of the string which will also
|
||||||
|
// respect various formatting flags if possible.
|
||||||
|
if chunk.invalid().is_empty() {
|
||||||
|
return valid.fmt(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
f.write_str(valid)?;
|
||||||
|
f.write_char(char::REPLACEMENT_CHARACTER)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,3 +8,11 @@ fn slice_debug_output() {
|
||||||
|
|
||||||
assert_eq!(output, expected);
|
assert_eq!(output, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn display() {
|
||||||
|
assert_eq!(
|
||||||
|
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
|
||||||
|
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue