Rollup merge of #72683 - RalfJung:char-debug-check, r=Mark-Simulacrum
from_u32_unchecked: check validity, and fix UB in Wtf8. Fixes https://github.com/rust-lang/rust/issues/72760
This commit is contained in:
commit
3bbb475f00
5 changed files with 112 additions and 68 deletions
|
@ -99,7 +99,7 @@ pub fn from_u32(i: u32) -> Option<char> {
|
||||||
#[inline]
|
#[inline]
|
||||||
#[stable(feature = "char_from_unchecked", since = "1.5.0")]
|
#[stable(feature = "char_from_unchecked", since = "1.5.0")]
|
||||||
pub unsafe fn from_u32_unchecked(i: u32) -> char {
|
pub unsafe fn from_u32_unchecked(i: u32) -> char {
|
||||||
transmute(i)
|
if cfg!(debug_assertions) { char::from_u32(i).unwrap() } else { transmute(i) }
|
||||||
}
|
}
|
||||||
|
|
||||||
#[stable(feature = "char_convert", since = "1.13.0")]
|
#[stable(feature = "char_convert", since = "1.13.0")]
|
||||||
|
@ -218,7 +218,7 @@ impl TryFrom<u32> for char {
|
||||||
Err(CharTryFromError(()))
|
Err(CharTryFromError(()))
|
||||||
} else {
|
} else {
|
||||||
// SAFETY: checked that it's a legal unicode value
|
// SAFETY: checked that it's a legal unicode value
|
||||||
Ok(unsafe { from_u32_unchecked(i) })
|
Ok(unsafe { transmute(i) })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -593,16 +593,7 @@ impl char {
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn len_utf8(self) -> usize {
|
pub fn len_utf8(self) -> usize {
|
||||||
let code = self as u32;
|
len_utf8(self as u32)
|
||||||
if code < MAX_ONE_B {
|
|
||||||
1
|
|
||||||
} else if code < MAX_TWO_B {
|
|
||||||
2
|
|
||||||
} else if code < MAX_THREE_B {
|
|
||||||
3
|
|
||||||
} else {
|
|
||||||
4
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the number of 16-bit code units this `char` would need if
|
/// Returns the number of 16-bit code units this `char` would need if
|
||||||
|
@ -670,36 +661,8 @@ impl char {
|
||||||
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
|
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
|
pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
|
||||||
let code = self as u32;
|
// SAFETY: `char` is not a surrogate, so this is valid UTF-8.
|
||||||
let len = self.len_utf8();
|
unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) }
|
||||||
match (len, &mut dst[..]) {
|
|
||||||
(1, [a, ..]) => {
|
|
||||||
*a = code as u8;
|
|
||||||
}
|
|
||||||
(2, [a, b, ..]) => {
|
|
||||||
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
|
|
||||||
*b = (code & 0x3F) as u8 | TAG_CONT;
|
|
||||||
}
|
|
||||||
(3, [a, b, c, ..]) => {
|
|
||||||
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
|
|
||||||
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
|
|
||||||
*c = (code & 0x3F) as u8 | TAG_CONT;
|
|
||||||
}
|
|
||||||
(4, [a, b, c, d, ..]) => {
|
|
||||||
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
|
|
||||||
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
|
|
||||||
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
|
|
||||||
*d = (code & 0x3F) as u8 | TAG_CONT;
|
|
||||||
}
|
|
||||||
_ => panic!(
|
|
||||||
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
|
|
||||||
len,
|
|
||||||
code,
|
|
||||||
dst.len(),
|
|
||||||
),
|
|
||||||
};
|
|
||||||
// SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
|
|
||||||
unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Encodes this character as UTF-16 into the provided `u16` buffer,
|
/// Encodes this character as UTF-16 into the provided `u16` buffer,
|
||||||
|
@ -739,28 +702,7 @@ impl char {
|
||||||
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
|
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
|
pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
|
||||||
let mut code = self as u32;
|
encode_utf16_raw(self as u32, dst)
|
||||||
// SAFETY: each arm checks whether there are enough bits to write into
|
|
||||||
unsafe {
|
|
||||||
if (code & 0xFFFF) == code && !dst.is_empty() {
|
|
||||||
// The BMP falls through (assuming non-surrogate, as it should)
|
|
||||||
*dst.get_unchecked_mut(0) = code as u16;
|
|
||||||
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
|
|
||||||
} else if dst.len() >= 2 {
|
|
||||||
// Supplementary planes break into surrogates.
|
|
||||||
code -= 0x1_0000;
|
|
||||||
*dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
|
|
||||||
*dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
|
|
||||||
slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
|
|
||||||
} else {
|
|
||||||
panic!(
|
|
||||||
"encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
|
|
||||||
from_u32_unchecked(code).len_utf16(),
|
|
||||||
code,
|
|
||||||
dst.len(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if this `char` has the `Alphabetic` property.
|
/// Returns `true` if this `char` has the `Alphabetic` property.
|
||||||
|
@ -1673,3 +1615,100 @@ impl char {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn len_utf8(code: u32) -> usize {
|
||||||
|
if code < MAX_ONE_B {
|
||||||
|
1
|
||||||
|
} else if code < MAX_TWO_B {
|
||||||
|
2
|
||||||
|
} else if code < MAX_THREE_B {
|
||||||
|
3
|
||||||
|
} else {
|
||||||
|
4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
|
||||||
|
/// and then returns the subslice of the buffer that contains the encoded character.
|
||||||
|
///
|
||||||
|
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
|
||||||
|
/// (Creating a `char` in the surrogate range is UB.)
|
||||||
|
/// The result is valid [generalized UTF-8] but not valid UTF-8.
|
||||||
|
///
|
||||||
|
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// Panics if the buffer is not large enough.
|
||||||
|
/// A buffer of length four is large enough to encode any `char`.
|
||||||
|
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
|
||||||
|
#[doc(hidden)]
|
||||||
|
#[inline]
|
||||||
|
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
|
||||||
|
let len = len_utf8(code);
|
||||||
|
match (len, &mut dst[..]) {
|
||||||
|
(1, [a, ..]) => {
|
||||||
|
*a = code as u8;
|
||||||
|
}
|
||||||
|
(2, [a, b, ..]) => {
|
||||||
|
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
|
||||||
|
*b = (code & 0x3F) as u8 | TAG_CONT;
|
||||||
|
}
|
||||||
|
(3, [a, b, c, ..]) => {
|
||||||
|
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
|
||||||
|
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
|
||||||
|
*c = (code & 0x3F) as u8 | TAG_CONT;
|
||||||
|
}
|
||||||
|
(4, [a, b, c, d, ..]) => {
|
||||||
|
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
|
||||||
|
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
|
||||||
|
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
|
||||||
|
*d = (code & 0x3F) as u8 | TAG_CONT;
|
||||||
|
}
|
||||||
|
_ => panic!(
|
||||||
|
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
|
||||||
|
len,
|
||||||
|
code,
|
||||||
|
dst.len(),
|
||||||
|
),
|
||||||
|
};
|
||||||
|
&mut dst[..len]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
|
||||||
|
/// and then returns the subslice of the buffer that contains the encoded character.
|
||||||
|
///
|
||||||
|
/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
|
||||||
|
/// (Creating a `char` in the surrogate range is UB.)
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// Panics if the buffer is not large enough.
|
||||||
|
/// A buffer of length 2 is large enough to encode any `char`.
|
||||||
|
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
|
||||||
|
#[doc(hidden)]
|
||||||
|
#[inline]
|
||||||
|
pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
|
||||||
|
// SAFETY: each arm checks whether there are enough bits to write into
|
||||||
|
unsafe {
|
||||||
|
if (code & 0xFFFF) == code && !dst.is_empty() {
|
||||||
|
// The BMP falls through
|
||||||
|
*dst.get_unchecked_mut(0) = code as u16;
|
||||||
|
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
|
||||||
|
} else if dst.len() >= 2 {
|
||||||
|
// Supplementary planes break into surrogates.
|
||||||
|
code -= 0x1_0000;
|
||||||
|
*dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
|
||||||
|
*dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
|
||||||
|
slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
|
||||||
|
} else {
|
||||||
|
panic!(
|
||||||
|
"encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
|
||||||
|
from_u32_unchecked(code).len_utf16(),
|
||||||
|
code,
|
||||||
|
dst.len(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -37,6 +37,12 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
|
||||||
#[stable(feature = "unicode_version", since = "1.45.0")]
|
#[stable(feature = "unicode_version", since = "1.45.0")]
|
||||||
pub use crate::unicode::UNICODE_VERSION;
|
pub use crate::unicode::UNICODE_VERSION;
|
||||||
|
|
||||||
|
// perma-unstable re-exports
|
||||||
|
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
|
||||||
|
pub use self::methods::encode_utf16_raw;
|
||||||
|
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
|
||||||
|
pub use self::methods::encode_utf8_raw;
|
||||||
|
|
||||||
use crate::fmt::{self, Write};
|
use crate::fmt::{self, Write};
|
||||||
use crate::iter::FusedIterator;
|
use crate::iter::FusedIterator;
|
||||||
|
|
||||||
|
|
|
@ -247,6 +247,7 @@
|
||||||
#![feature(cfg_target_has_atomic)]
|
#![feature(cfg_target_has_atomic)]
|
||||||
#![feature(cfg_target_thread_local)]
|
#![feature(cfg_target_thread_local)]
|
||||||
#![feature(char_error_internals)]
|
#![feature(char_error_internals)]
|
||||||
|
#![feature(char_internals)]
|
||||||
#![feature(clamp)]
|
#![feature(clamp)]
|
||||||
#![feature(concat_idents)]
|
#![feature(concat_idents)]
|
||||||
#![feature(const_cstr_unchecked)]
|
#![feature(const_cstr_unchecked)]
|
||||||
|
|
|
@ -201,9 +201,8 @@ impl Wtf8Buf {
|
||||||
/// Copied from String::push
|
/// Copied from String::push
|
||||||
/// This does **not** include the WTF-8 concatenation check.
|
/// This does **not** include the WTF-8 concatenation check.
|
||||||
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
|
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
|
||||||
let c = unsafe { char::from_u32_unchecked(code_point.value) };
|
|
||||||
let mut bytes = [0; 4];
|
let mut bytes = [0; 4];
|
||||||
let bytes = c.encode_utf8(&mut bytes).as_bytes();
|
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
|
||||||
self.bytes.extend_from_slice(bytes)
|
self.bytes.extend_from_slice(bytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -840,8 +839,7 @@ impl<'a> Iterator for EncodeWide<'a> {
|
||||||
|
|
||||||
let mut buf = [0; 2];
|
let mut buf = [0; 2];
|
||||||
self.code_points.next().map(|code_point| {
|
self.code_points.next().map(|code_point| {
|
||||||
let c = unsafe { char::from_u32_unchecked(code_point.value) };
|
let n = char::encode_utf16_raw(code_point.value, &mut buf).len();
|
||||||
let n = c.encode_utf16(&mut buf).len();
|
|
||||||
if n == 2 {
|
if n == 2 {
|
||||||
self.extra = buf[1];
|
self.extra = buf[1];
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue