Switched to the two-way algorithm for string searching
test str::bench::bench_contains_bad_naive ... bench: 300 ns/iter (+/- 12) from 1309 ns/iter (+/- 36) test str::bench::bench_contains_equal ... bench: 154 ns/iter (+/- 7) from 137 ns/iter (+/- 2) test str::bench::bench_contains_short_long ... bench: 2998 ns/iter (+/- 74) from 5473 ns/iter (+/- 14) test str::bench::bench_contains_short_short ... bench: 65 ns/iter (+/- 2) from 57 ns/iter (+/- 6)
This commit is contained in:
parent
8a32a2a872
commit
39cb5b13e6
1 changed files with 206 additions and 26 deletions
|
@ -15,16 +15,19 @@
|
||||||
use mem;
|
use mem;
|
||||||
use char;
|
use char;
|
||||||
use clone::Clone;
|
use clone::Clone;
|
||||||
|
use cmp;
|
||||||
use cmp::{Eq, TotalEq};
|
use cmp::{Eq, TotalEq};
|
||||||
use container::Container;
|
use container::Container;
|
||||||
use default::Default;
|
use default::Default;
|
||||||
use iter::{Filter, Map, Iterator};
|
use iter::{Filter, Map, Iterator};
|
||||||
use iter::{Rev, DoubleEndedIterator, ExactSize};
|
use iter::{Rev, DoubleEndedIterator, ExactSize};
|
||||||
|
use iter::range;
|
||||||
use num::Saturating;
|
use num::Saturating;
|
||||||
use option::{None, Option, Some};
|
use option::{None, Option, Some};
|
||||||
use raw::Repr;
|
use raw::Repr;
|
||||||
use slice::{ImmutableVector, Vector};
|
use slice::{ImmutableVector, Vector};
|
||||||
use slice;
|
use slice;
|
||||||
|
use uint;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Section: Creating a string
|
Section: Creating a string
|
||||||
|
@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The internal state of an iterator that searches for matches of a substring
|
||||||
|
/// within a larger string using naive search
|
||||||
|
#[deriving(Clone)]
|
||||||
|
struct NaiveSearcher {
|
||||||
|
position: uint
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NaiveSearcher {
|
||||||
|
fn new() -> NaiveSearcher {
|
||||||
|
NaiveSearcher { position: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> {
|
||||||
|
while self.position + needle.len() <= haystack.len() {
|
||||||
|
if haystack.slice(self.position, self.position + needle.len()) == needle {
|
||||||
|
let matchPos = self.position;
|
||||||
|
self.position += needle.len(); // add 1 for all matches
|
||||||
|
return Some((matchPos, matchPos + needle.len()));
|
||||||
|
} else {
|
||||||
|
self.position += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The internal state of an iterator that searches for matches of a substring
|
||||||
|
/// within a larger string using two-way search
|
||||||
|
#[deriving(Clone)]
|
||||||
|
struct TwoWaySearcher {
|
||||||
|
// constants
|
||||||
|
critPos: uint,
|
||||||
|
period: uint,
|
||||||
|
byteset: u64,
|
||||||
|
|
||||||
|
// variables
|
||||||
|
position: uint,
|
||||||
|
memory: uint
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TwoWaySearcher {
|
||||||
|
fn new(needle: &[u8]) -> TwoWaySearcher {
|
||||||
|
let (critPos1, period1) = TwoWaySearcher::maximal_suffix(needle, false);
|
||||||
|
let (critPos2, period2) = TwoWaySearcher::maximal_suffix(needle, true);
|
||||||
|
|
||||||
|
let critPos;
|
||||||
|
let period;
|
||||||
|
if critPos1 > critPos2 {
|
||||||
|
critPos = critPos1;
|
||||||
|
period = period1;
|
||||||
|
} else {
|
||||||
|
critPos = critPos2;
|
||||||
|
period = period2;
|
||||||
|
}
|
||||||
|
|
||||||
|
let byteset = needle.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a);
|
||||||
|
|
||||||
|
if needle.slice_to(critPos) == needle.slice_from(needle.len() - critPos) {
|
||||||
|
TwoWaySearcher {
|
||||||
|
critPos: critPos,
|
||||||
|
period: period,
|
||||||
|
byteset: byteset,
|
||||||
|
|
||||||
|
position: 0,
|
||||||
|
memory: 0
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
TwoWaySearcher {
|
||||||
|
critPos: critPos,
|
||||||
|
period: cmp::max(critPos, needle.len() - critPos) + 1,
|
||||||
|
byteset: byteset,
|
||||||
|
|
||||||
|
position: 0,
|
||||||
|
memory: uint::MAX // Dummy value to signify that the period is long
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next(&mut self, haystack: &[u8], needle: &[u8], longPeriod: bool) -> Option<(uint, uint)> {
|
||||||
|
'search: loop {
|
||||||
|
// Check that we have room to search in
|
||||||
|
if self.position + needle.len() > haystack.len() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quickly skip by large portions unrelated to our substring
|
||||||
|
if (self.byteset >> (haystack[self.position + needle.len() - 1] & 0x3f)) & 1 == 0 {
|
||||||
|
self.position += needle.len();
|
||||||
|
continue 'search;
|
||||||
|
}
|
||||||
|
|
||||||
|
// See if the right part of the needle matches
|
||||||
|
let start = if longPeriod { self.critPos } else { cmp::max(self.critPos, self.memory) };
|
||||||
|
for i in range(start, needle.len()) {
|
||||||
|
if needle[i] != haystack[self.position + i] {
|
||||||
|
self.position += i - self.critPos + 1;
|
||||||
|
if !longPeriod {
|
||||||
|
self.memory = 0;
|
||||||
|
}
|
||||||
|
continue 'search;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// See if the left part of the needle matches
|
||||||
|
let start = if longPeriod { 0 } else { self.memory };
|
||||||
|
for i in range(start, self.critPos).rev() {
|
||||||
|
if needle[i] != haystack[self.position + i] {
|
||||||
|
self.position += self.period;
|
||||||
|
if !longPeriod {
|
||||||
|
self.memory = needle.len() - self.period;
|
||||||
|
}
|
||||||
|
continue 'search;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We have found a match!
|
||||||
|
let matchPos = self.position;
|
||||||
|
self.position += needle.len(); // add self.period for all matches
|
||||||
|
if !longPeriod {
|
||||||
|
self.memory = 0; // set to needle.len() - self.period for all matches
|
||||||
|
}
|
||||||
|
return Some((matchPos, matchPos + needle.len()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) {
|
||||||
|
let mut left = -1; // Corresponds to i in the paper
|
||||||
|
let mut right = 0; // Corresponds to j in the paper
|
||||||
|
let mut offset = 1; // Corresponds to k in the paper
|
||||||
|
let mut period = 1; // Corresponds to p in the paper
|
||||||
|
|
||||||
|
while right + offset < arr.len() {
|
||||||
|
let a;
|
||||||
|
let b;
|
||||||
|
if reversed {
|
||||||
|
a = arr[left + offset];
|
||||||
|
b = arr[right + offset];
|
||||||
|
} else {
|
||||||
|
a = arr[right + offset];
|
||||||
|
b = arr[left + offset];
|
||||||
|
}
|
||||||
|
if a < b {
|
||||||
|
// Suffix is smaller, period is entire prefix so far.
|
||||||
|
right += offset;
|
||||||
|
offset = 1;
|
||||||
|
period = right - left;
|
||||||
|
} else if a == b {
|
||||||
|
// Advance through repetition of the current period.
|
||||||
|
if offset == period {
|
||||||
|
right += offset;
|
||||||
|
offset = 1;
|
||||||
|
} else {
|
||||||
|
offset += 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Suffix is larger, start over from current location.
|
||||||
|
left = right;
|
||||||
|
right += 1;
|
||||||
|
offset = 1;
|
||||||
|
period = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(left + 1, period)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The internal state of an iterator that searches for matches of a substring
|
||||||
|
/// within a larger string using a dynamically chosed search algorithm
|
||||||
|
#[deriving(Clone)]
|
||||||
|
enum Searcher {
|
||||||
|
Naive(NaiveSearcher),
|
||||||
|
TwoWay(TwoWaySearcher),
|
||||||
|
TwoWayLong(TwoWaySearcher)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Searcher {
|
||||||
|
fn new(haystack: &[u8], needle: &[u8]) -> Searcher {
|
||||||
|
// FIXME: Tune this.
|
||||||
|
if needle.len() > haystack.len() - 20 {
|
||||||
|
Naive(NaiveSearcher::new())
|
||||||
|
} else {
|
||||||
|
let searcher = TwoWaySearcher::new(needle);
|
||||||
|
if searcher.memory == uint::MAX { // If the period is long
|
||||||
|
TwoWayLong(searcher)
|
||||||
|
} else {
|
||||||
|
TwoWay(searcher)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// An iterator over the start and end indices of the matches of a
|
/// An iterator over the start and end indices of the matches of a
|
||||||
/// substring within a larger string
|
/// substring within a larger string
|
||||||
#[deriving(Clone)]
|
#[deriving(Clone)]
|
||||||
pub struct MatchIndices<'a> {
|
pub struct MatchIndices<'a> {
|
||||||
|
// constants
|
||||||
haystack: &'a str,
|
haystack: &'a str,
|
||||||
needle: &'a str,
|
needle: &'a str,
|
||||||
position: uint,
|
searcher: Searcher
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An iterator over the substrings of a string separated by a given
|
/// An iterator over the substrings of a string separated by a given
|
||||||
|
@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
|
||||||
impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
|
impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next(&mut self) -> Option<(uint, uint)> {
|
fn next(&mut self) -> Option<(uint, uint)> {
|
||||||
// See Issue #1932 for why this is a naive search
|
match self.searcher {
|
||||||
let (h_len, n_len) = (self.haystack.len(), self.needle.len());
|
Naive(ref mut searcher)
|
||||||
let mut match_start = 0;
|
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()),
|
||||||
let mut match_i = 0;
|
TwoWay(ref mut searcher)
|
||||||
|
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
|
||||||
while self.position < h_len {
|
TwoWayLong(ref mut searcher)
|
||||||
if self.haystack[self.position] == self.needle[match_i] {
|
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true)
|
||||||
if match_i == 0 { match_start = self.position; }
|
|
||||||
match_i += 1;
|
|
||||||
self.position += 1;
|
|
||||||
|
|
||||||
if match_i == n_len {
|
|
||||||
// found a match!
|
|
||||||
return Some((match_start, self.position));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// failed match, backtrack
|
|
||||||
if match_i > 0 {
|
|
||||||
match_i = 0;
|
|
||||||
self.position = match_start;
|
|
||||||
}
|
|
||||||
self.position += 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
None
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
|
||||||
MatchIndices {
|
MatchIndices {
|
||||||
haystack: *self,
|
haystack: *self,
|
||||||
needle: sep,
|
needle: sep,
|
||||||
position: 0
|
searcher: Searcher::new(self.as_bytes(), sep.as_bytes())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue