From fea1bd4cdf375fa097789b2484b746cacdfddb73 Mon Sep 17 00:00:00 2001 From: Brian Anderson Date: Thu, 22 Sep 2016 00:10:37 +0000 Subject: [PATCH] std: Move platform specific memchr code into sys --- src/libstd/memchr.rs | 275 +------------------------------ src/libstd/sys/common/memchr.rs | 230 ++++++++++++++++++++++++++ src/libstd/sys/common/mod.rs | 1 + src/libstd/sys/unix/memchr.rs | 57 +++++++ src/libstd/sys/unix/mod.rs | 1 + src/libstd/sys/windows/memchr.rs | 15 ++ src/libstd/sys/windows/mod.rs | 1 + 7 files changed, 309 insertions(+), 271 deletions(-) create mode 100644 src/libstd/sys/common/memchr.rs create mode 100644 src/libstd/sys/unix/memchr.rs create mode 100644 src/libstd/sys/windows/memchr.rs diff --git a/src/libstd/memchr.rs b/src/libstd/memchr.rs index 03f55f7ad61..7c8c97a6caf 100644 --- a/src/libstd/memchr.rs +++ b/src/libstd/memchr.rs @@ -11,8 +11,6 @@ // Original implementation taken from rust-memchr // Copyright 2015 Andrew Gallant, bluss and Nicolas Koch - - /// A safe interface to `memchr`. /// /// Returns the index corresponding to the first occurrence of `needle` in @@ -32,32 +30,9 @@ /// let haystack = b"the quick brown fox"; /// assert_eq!(memchr(b'k', haystack), Some(8)); /// ``` +#[inline] pub fn memchr(needle: u8, haystack: &[u8]) -> Option { - // libc memchr - #[cfg(not(target_os = "windows"))] - fn memchr_specific(needle: u8, haystack: &[u8]) -> Option { - use libc; - - let p = unsafe { - libc::memchr( - haystack.as_ptr() as *const libc::c_void, - needle as libc::c_int, - haystack.len() as libc::size_t) - }; - if p.is_null() { - None - } else { - Some(p as usize - (haystack.as_ptr() as usize)) - } - } - - // use fallback on windows, since it's faster - #[cfg(target_os = "windows")] - fn memchr_specific(needle: u8, haystack: &[u8]) -> Option { - fallback::memchr(needle, haystack) - } - - memchr_specific(needle, haystack) + ::sys::memchr::memchr(needle, haystack) } /// A safe interface to `memrchr`. @@ -75,251 +50,9 @@ pub fn memchr(needle: u8, haystack: &[u8]) -> Option { /// let haystack = b"the quick brown fox"; /// assert_eq!(memrchr(b'o', haystack), Some(17)); /// ``` +#[inline] pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { - - #[cfg(target_os = "linux")] - fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { - use libc; - - // GNU's memrchr() will - unlike memchr() - error if haystack is empty. - if haystack.is_empty() {return None} - let p = unsafe { - libc::memrchr( - haystack.as_ptr() as *const libc::c_void, - needle as libc::c_int, - haystack.len() as libc::size_t) - }; - if p.is_null() { - None - } else { - Some(p as usize - (haystack.as_ptr() as usize)) - } - } - - #[cfg(not(target_os = "linux"))] - fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { - fallback::memrchr(needle, haystack) - } - - memrchr_specific(needle, haystack) -} - -#[allow(dead_code)] -mod fallback { - use cmp; - use mem; - - const LO_U64: u64 = 0x0101010101010101; - const HI_U64: u64 = 0x8080808080808080; - - // use truncation - const LO_USIZE: usize = LO_U64 as usize; - const HI_USIZE: usize = HI_U64 as usize; - - /// Return `true` if `x` contains any zero byte. - /// - /// From *Matters Computational*, J. Arndt - /// - /// "The idea is to subtract one from each of the bytes and then look for - /// bytes where the borrow propagated all the way to the most significant - /// bit." - #[inline] - fn contains_zero_byte(x: usize) -> bool { - x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 - } - - #[cfg(target_pointer_width = "32")] - #[inline] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep - } - - #[cfg(target_pointer_width = "64")] - #[inline] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep = rep << 32 | rep; - rep - } - - /// Return the first index matching the byte `a` in `text`. - pub fn memchr(x: u8, text: &[u8]) -> Option { - // Scan for a single byte value by reading two `usize` words at a time. - // - // Split `text` in three parts - // - unaligned initial part, before the first word aligned address in text - // - body, scan by 2 words at a time - // - the last remaining part, < 2 word size - let len = text.len(); - let ptr = text.as_ptr(); - let usize_bytes = mem::size_of::(); - - // search up to an aligned boundary - let align = (ptr as usize) & (usize_bytes- 1); - let mut offset; - if align > 0 { - offset = cmp::min(usize_bytes - align, len); - if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { - return Some(index); - } - } else { - offset = 0; - } - - // search the body of the text - let repeated_x = repeat_byte(x); - - if len >= 2 * usize_bytes { - while offset <= len - 2 * usize_bytes { - unsafe { - let u = *(ptr.offset(offset as isize) as *const usize); - let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); - - // break if there is a matching byte - let zu = contains_zero_byte(u ^ repeated_x); - let zv = contains_zero_byte(v ^ repeated_x); - if zu || zv { - break; - } - } - offset += usize_bytes * 2; - } - } - - // find the byte after the point the body loop stopped - text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) - } - - /// Return the last index matching the byte `a` in `text`. - pub fn memrchr(x: u8, text: &[u8]) -> Option { - // Scan for a single byte value by reading two `usize` words at a time. - // - // Split `text` in three parts - // - unaligned tail, after the last word aligned address in text - // - body, scan by 2 words at a time - // - the first remaining bytes, < 2 word size - let len = text.len(); - let ptr = text.as_ptr(); - let usize_bytes = mem::size_of::(); - - // search to an aligned boundary - let end_align = (ptr as usize + len) & (usize_bytes - 1); - let mut offset; - if end_align > 0 { - offset = if end_align >= len { 0 } else { len - end_align }; - if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { - return Some(offset + index); - } - } else { - offset = len; - } - - // search the body of the text - let repeated_x = repeat_byte(x); - - while offset >= 2 * usize_bytes { - unsafe { - let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); - let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); - - // break if there is a matching byte - let zu = contains_zero_byte(u ^ repeated_x); - let zv = contains_zero_byte(v ^ repeated_x); - if zu || zv { - break; - } - } - offset -= 2 * usize_bytes; - } - - // find the byte before the point the body loop stopped - text[..offset].iter().rposition(|elt| *elt == x) - } - - // test fallback implementations on all platforms - #[test] - fn matches_one() { - assert_eq!(Some(0), memchr(b'a', b"a")); - } - - #[test] - fn matches_begin() { - assert_eq!(Some(0), memchr(b'a', b"aaaa")); - } - - #[test] - fn matches_end() { - assert_eq!(Some(4), memchr(b'z', b"aaaaz")); - } - - #[test] - fn matches_nul() { - assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); - } - - #[test] - fn matches_past_nul() { - assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); - } - - #[test] - fn no_match_empty() { - assert_eq!(None, memchr(b'a', b"")); - } - - #[test] - fn no_match() { - assert_eq!(None, memchr(b'a', b"xyz")); - } - - #[test] - fn matches_one_reversed() { - assert_eq!(Some(0), memrchr(b'a', b"a")); - } - - #[test] - fn matches_begin_reversed() { - assert_eq!(Some(3), memrchr(b'a', b"aaaa")); - } - - #[test] - fn matches_end_reversed() { - assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); - } - - #[test] - fn matches_nul_reversed() { - assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); - } - - #[test] - fn matches_past_nul_reversed() { - assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); - } - - #[test] - fn no_match_empty_reversed() { - assert_eq!(None, memrchr(b'a', b"")); - } - - #[test] - fn no_match_reversed() { - assert_eq!(None, memrchr(b'a', b"xyz")); - } - - #[test] - fn each_alignment_reversed() { - let mut data = [1u8; 64]; - let needle = 2; - let pos = 40; - data[pos] = needle; - for start in 0..16 { - assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); - } - } + ::sys::memchr::memrchr(needle, haystack) } #[cfg(test)] diff --git a/src/libstd/sys/common/memchr.rs b/src/libstd/sys/common/memchr.rs new file mode 100644 index 00000000000..3824a5fb528 --- /dev/null +++ b/src/libstd/sys/common/memchr.rs @@ -0,0 +1,230 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// Original implementation taken from rust-memchr +// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch + +#[allow(dead_code)] +pub mod fallback { + use cmp; + use mem; + + const LO_U64: u64 = 0x0101010101010101; + const HI_U64: u64 = 0x8080808080808080; + + // use truncation + const LO_USIZE: usize = LO_U64 as usize; + const HI_USIZE: usize = HI_U64 as usize; + + /// Return `true` if `x` contains any zero byte. + /// + /// From *Matters Computational*, J. Arndt + /// + /// "The idea is to subtract one from each of the bytes and then look for + /// bytes where the borrow propagated all the way to the most significant + /// bit." + #[inline] + fn contains_zero_byte(x: usize) -> bool { + x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 + } + + #[cfg(target_pointer_width = "32")] + #[inline] + fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep + } + + #[cfg(target_pointer_width = "64")] + #[inline] + fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep = rep << 32 | rep; + rep + } + + /// Return the first index matching the byte `a` in `text`. + pub fn memchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned initial part, before the first word aligned address in text + // - body, scan by 2 words at a time + // - the last remaining part, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search up to an aligned boundary + let align = (ptr as usize) & (usize_bytes- 1); + let mut offset; + if align > 0 { + offset = cmp::min(usize_bytes - align, len); + if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { + return Some(index); + } + } else { + offset = 0; + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + if len >= 2 * usize_bytes { + while offset <= len - 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize) as *const usize); + let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset += usize_bytes * 2; + } + } + + // find the byte after the point the body loop stopped + text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) + } + + /// Return the last index matching the byte `a` in `text`. + pub fn memrchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned tail, after the last word aligned address in text + // - body, scan by 2 words at a time + // - the first remaining bytes, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search to an aligned boundary + let end_align = (ptr as usize + len) & (usize_bytes - 1); + let mut offset; + if end_align > 0 { + offset = if end_align >= len { 0 } else { len - end_align }; + if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { + return Some(offset + index); + } + } else { + offset = len; + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + while offset >= 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); + let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset -= 2 * usize_bytes; + } + + // find the byte before the point the body loop stopped + text[..offset].iter().rposition(|elt| *elt == x) + } + + // test fallback implementations on all platforms + #[test] + fn matches_one() { + assert_eq!(Some(0), memchr(b'a', b"a")); + } + + #[test] + fn matches_begin() { + assert_eq!(Some(0), memchr(b'a', b"aaaa")); + } + + #[test] + fn matches_end() { + assert_eq!(Some(4), memchr(b'z', b"aaaaz")); + } + + #[test] + fn matches_nul() { + assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); + } + + #[test] + fn matches_past_nul() { + assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); + } + + #[test] + fn no_match_empty() { + assert_eq!(None, memchr(b'a', b"")); + } + + #[test] + fn no_match() { + assert_eq!(None, memchr(b'a', b"xyz")); + } + + #[test] + fn matches_one_reversed() { + assert_eq!(Some(0), memrchr(b'a', b"a")); + } + + #[test] + fn matches_begin_reversed() { + assert_eq!(Some(3), memrchr(b'a', b"aaaa")); + } + + #[test] + fn matches_end_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); + } + + #[test] + fn matches_nul_reversed() { + assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); + } + + #[test] + fn matches_past_nul_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); + } + + #[test] + fn no_match_empty_reversed() { + assert_eq!(None, memrchr(b'a', b"")); + } + + #[test] + fn no_match_reversed() { + assert_eq!(None, memrchr(b'a', b"xyz")); + } + + #[test] + fn each_alignment_reversed() { + let mut data = [1u8; 64]; + let needle = 2; + let pos = 40; + data[pos] = needle; + for start in 0..16 { + assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); + } + } +} diff --git a/src/libstd/sys/common/mod.rs b/src/libstd/sys/common/mod.rs index d1ca6765107..9a46c617b97 100644 --- a/src/libstd/sys/common/mod.rs +++ b/src/libstd/sys/common/mod.rs @@ -31,6 +31,7 @@ pub mod at_exit_imp; pub mod backtrace; pub mod condvar; pub mod io; +pub mod memchr; pub mod mutex; pub mod net; pub mod poison; diff --git a/src/libstd/sys/unix/memchr.rs b/src/libstd/sys/unix/memchr.rs new file mode 100644 index 00000000000..5038e0750c8 --- /dev/null +++ b/src/libstd/sys/unix/memchr.rs @@ -0,0 +1,57 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// Original implementation taken from rust-memchr +// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch + +pub fn memchr(needle: u8, haystack: &[u8]) -> Option { + use libc; + + let p = unsafe { + libc::memchr( + haystack.as_ptr() as *const libc::c_void, + needle as libc::c_int, + haystack.len() as libc::size_t) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } +} + +pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { + + #[cfg(target_os = "linux")] + fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { + use libc; + + // GNU's memrchr() will - unlike memchr() - error if haystack is empty. + if haystack.is_empty() {return None} + let p = unsafe { + libc::memrchr( + haystack.as_ptr() as *const libc::c_void, + needle as libc::c_int, + haystack.len() as libc::size_t) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } + } + + #[cfg(not(target_os = "linux"))] + fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { + ::sys_common::memchr::fallback::memrchr(needle, haystack) + } + + memrchr_specific(needle, haystack) +} diff --git a/src/libstd/sys/unix/mod.rs b/src/libstd/sys/unix/mod.rs index abe7018f25d..c4763e1739e 100644 --- a/src/libstd/sys/unix/mod.rs +++ b/src/libstd/sys/unix/mod.rs @@ -38,6 +38,7 @@ pub mod env; pub mod ext; pub mod fd; pub mod fs; +pub mod memchr; pub mod mutex; pub mod net; pub mod os; diff --git a/src/libstd/sys/windows/memchr.rs b/src/libstd/sys/windows/memchr.rs new file mode 100644 index 00000000000..5a5386acaa5 --- /dev/null +++ b/src/libstd/sys/windows/memchr.rs @@ -0,0 +1,15 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// Original implementation taken from rust-memchr +// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch + +// Fallback memchr is fastest on windows +pub use sys_common::memchr::fallback::{memchr, memrchr}; diff --git a/src/libstd/sys/windows/mod.rs b/src/libstd/sys/windows/mod.rs index c31726a4a25..df9cef33927 100644 --- a/src/libstd/sys/windows/mod.rs +++ b/src/libstd/sys/windows/mod.rs @@ -26,6 +26,7 @@ pub mod env; pub mod ext; pub mod fs; pub mod handle; +pub mod memchr; pub mod mutex; pub mod net; pub mod os;