Rollup merge of #72683 - RalfJung:char-debug-check, r=Mark-Simulacrum

from_u32_unchecked: check validity, and fix UB in Wtf8. Fixes https://github.com/rust-lang/rust/issues/72760
2020-05-31 12:03:22 +02:00 · 2020-05-31 12:03:22 +02:00 · 3bbb475f00
commit 3bbb475f00
parent b6fa392238 0fb6e63c04
5 changed files with 112 additions and 68 deletions
--- a/src/libcore/char/convert.rs
+++ b/src/libcore/char/convert.rs
@ -99,7 +99,7 @@ pub fn from_u32(i: u32) -> Option<char> {
 #[inline]
 #[stable(feature = "char_from_unchecked", since = "1.5.0")]
 pub unsafe fn from_u32_unchecked(i: u32) -> char {
-    transmute(i)
+    if cfg!(debug_assertions) { char::from_u32(i).unwrap() } else { transmute(i) }
 }
 #[stable(feature = "char_convert", since = "1.13.0")]
@ -218,7 +218,7 @@ impl TryFrom<u32> for char {
            Err(CharTryFromError(()))
        } else {
            // SAFETY: checked that it's a legal unicode value
-            Ok(unsafe { from_u32_unchecked(i) })
+            Ok(unsafe { transmute(i) })
        }
    }
 }
--- a/src/libcore/char/methods.rs
+++ b/src/libcore/char/methods.rs
@ -593,16 +593,7 @@ impl char {
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn len_utf8(self) -> usize {
-        let code = self as u32;
+        len_utf8(self as u32)
        if code < MAX_ONE_B {
            1
        } else if code < MAX_TWO_B {
            2
        } else if code < MAX_THREE_B {
            3
        } else {
            4
        }
    }
    /// Returns the number of 16-bit code units this `char` would need if
@ -670,36 +661,8 @@ impl char {
    #[stable(feature = "unicode_encode_char", since = "1.15.0")]
    #[inline]
    pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
-        let code = self as u32;
+        // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
-        let len = self.len_utf8();
+        unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) }
        match (len, &mut dst[..]) {
            (1, [a, ..]) => {
                *a = code as u8;
            }
            (2, [a, b, ..]) => {
                *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
                *b = (code & 0x3F) as u8 | TAG_CONT;
            }
            (3, [a, b, c, ..]) => {
                *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
                *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
                *c = (code & 0x3F) as u8 | TAG_CONT;
            }
            (4, [a, b, c, d, ..]) => {
                *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
                *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
                *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
                *d = (code & 0x3F) as u8 | TAG_CONT;
            }
            _ => panic!(
                "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
                len,
                code,
                dst.len(),
            ),
        };
        // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
        unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
    }
    /// Encodes this character as UTF-16 into the provided `u16` buffer,
@ -739,28 +702,7 @@ impl char {
    #[stable(feature = "unicode_encode_char", since = "1.15.0")]
    #[inline]
    pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
-        let mut code = self as u32;
+        encode_utf16_raw(self as u32, dst)
        // SAFETY: each arm checks whether there are enough bits to write into
        unsafe {
            if (code & 0xFFFF) == code && !dst.is_empty() {
                // The BMP falls through (assuming non-surrogate, as it should)
                *dst.get_unchecked_mut(0) = code as u16;
                slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
            } else if dst.len() >= 2 {
                // Supplementary planes break into surrogates.
                code -= 0x1_0000;
                *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
                *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
                slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
            } else {
                panic!(
                    "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
                    from_u32_unchecked(code).len_utf16(),
                    code,
                    dst.len(),
                )
            }
        }
    }
    /// Returns `true` if this `char` has the `Alphabetic` property.
@ -1673,3 +1615,100 @@ impl char {
        }
    }
 }
 /// Returns how many bytes the UTF-8 encoding of `code` occupies (1 to 4).
 ///
 /// `code` is compared in turn against the exclusive upper bounds `MAX_ONE_B`,
 /// `MAX_TWO_B` and `MAX_THREE_B`; any value at or above `MAX_THREE_B` is
 /// reported as needing four bytes. No validity check is performed on `code`.
 #[inline]
 fn len_utf8(code: u32) -> usize {
    match code {
        c if c < MAX_ONE_B => 1,
        c if c < MAX_TWO_B => 2,
        c if c < MAX_THREE_B => 3,
        _ => 4,
    }
 }
 /// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
 /// and then returns the subslice of the buffer that contains the encoded character.
 ///
 /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
 /// (Creating a `char` in the surrogate range is UB.)
 /// The result is valid [generalized UTF-8]; it is also valid UTF-8 unless `code` is in the surrogate range.
 ///
 /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
 ///
 /// # Panics
 ///
 /// Panics if the buffer is not large enough.
 /// A buffer of length four is large enough to encode any `char`.
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 #[doc(hidden)]
 #[inline]
 pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
    // Number of bytes the encoding of `code` occupies.
    let needed = len_utf8(code);
    // Six bits of `code` starting at bit `shift`, tagged as a continuation byte.
    let trailing = |shift: u32| (code >> shift & 0x3F) as u8 | TAG_CONT;

    // `get_mut(..needed)` yields `None` when `dst` is too short, which lands in
    // the catch-all arm together with any length not covered by the patterns.
    match dst.get_mut(..needed) {
        Some([only]) => *only = code as u8,
        Some([lead, t0]) => {
            *lead = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
            *t0 = trailing(0);
        }
        Some([lead, t0, t1]) => {
            *lead = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
            *t0 = trailing(6);
            *t1 = trailing(0);
        }
        Some([lead, t0, t1, t2]) => {
            *lead = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
            *t0 = trailing(12);
            *t1 = trailing(6);
            *t2 = trailing(0);
        }
        _ => panic!(
            "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
            needed,
            code,
            dst.len(),
        ),
    }

    // Hand back only the prefix that was just written.
    &mut dst[..needed]
 }
 /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
 /// and then returns the subslice of the buffer that contains the encoded character.
 ///
 /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
 /// (Creating a `char` in the surrogate range is UB.)
 ///
 /// # Panics
 ///
 /// Panics if the buffer is not large enough.
 /// A buffer of length 2 is large enough to encode any `char`.
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 #[doc(hidden)]
 #[inline]
 pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
    // Number of `u16` units required: one if `code` fits in 16 bits (the BMP,
    // surrogate code points included), two otherwise. This is computed from
    // the raw value, because `code` may be a surrogate and building a `char`
    // from it just to ask for its length would be undefined behavior.
    let len = if (code & 0xFFFF) == code { 1 } else { 2 };

    // The slice patterns double as the buffer-size check, so no `unsafe` is
    // needed; a buffer that is too short ends up in the catch-all arm.
    match (len, &mut dst[..]) {
        (1, [a, ..]) => {
            // The BMP is stored as-is.
            *a = code as u16;
        }
        (2, [a, b, ..]) => {
            // Supplementary planes break into a surrogate pair.
            code -= 0x1_0000;
            *a = 0xD800 | ((code >> 10) as u16);
            *b = 0xDC00 | ((code as u16) & 0x3FF);
        }
        _ => panic!(
            "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
            len,
            code,
            dst.len(),
        ),
    };

    // Return only the units that were written.
    &mut dst[..len]
 }
--- a/src/libcore/char/mod.rs
+++ b/src/libcore/char/mod.rs
@ -37,6 +37,12 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
 #[stable(feature = "unicode_version", since = "1.45.0")]
 pub use crate::unicode::UNICODE_VERSION;
 // perma-unstable re-exports
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf16_raw;
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf8_raw;
 use crate::fmt::{self, Write};
 use crate::iter::FusedIterator;
--- a/src/libstd/lib.rs
+++ b/src/libstd/lib.rs
@ -247,6 +247,7 @@
 #![feature(cfg_target_has_atomic)]
 #![feature(cfg_target_thread_local)]
 #![feature(char_error_internals)]
 #![feature(char_internals)]
 #![feature(clamp)]
 #![feature(concat_idents)]
 #![feature(const_cstr_unchecked)]
--- a/src/libstd/sys_common/wtf8.rs
+++ b/src/libstd/sys_common/wtf8.rs
@ -201,9 +201,8 @@ impl Wtf8Buf {
    /// Copied from String::push
    /// This does **not** include the WTF-8 concatenation check.
    fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
        let c = unsafe { char::from_u32_unchecked(code_point.value) };
        let mut bytes = [0; 4];
-        let bytes = c.encode_utf8(&mut bytes).as_bytes();
+        let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
        self.bytes.extend_from_slice(bytes)
    }
@ -840,8 +839,7 @@ impl<'a> Iterator for EncodeWide<'a> {
        let mut buf = [0; 2];
        self.code_points.next().map(|code_point| {
-            let c = unsafe { char::from_u32_unchecked(code_point.value) };
+            let n = char::encode_utf16_raw(code_point.value, &mut buf).len();
            let n = c.encode_utf16(&mut buf).len();
            if n == 2 {
                self.extra = buf[1];
            }