Use restricted Damerau-Levenshtein algorithm
This commit is contained in:
parent
231bcd131d
commit
ff052eec80
3 changed files with 75 additions and 24 deletions
|
@ -1,49 +1,97 @@
|
||||||
//! Levenshtein distances.
|
//! Damerau-Levenshtein distances.
|
||||||
//!
|
//!
|
||||||
//! The [Levenshtein distance] is a metric for measuring the difference between two strings.
|
//! The [Damerau-Levenshtein distance] is a metric for measuring the difference between two strings.
|
||||||
|
//! This implementation is a restricted version of the algorithm, as it does not permit modifying
|
||||||
|
//! characters that have already been transposed.
|
||||||
//!
|
//!
|
||||||
//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
|
//! [Damerau-Levenshtein distance]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
|
||||||
|
|
||||||
use crate::symbol::Symbol;
|
use crate::symbol::Symbol;
|
||||||
use std::cmp;
|
use std::{cmp, mem};
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
||||||
/// Finds the Levenshtein distance between two strings.
|
/// Finds the restricted Damerau-Levenshtein distance between two strings. Characters that have
|
||||||
|
/// already been transposed may not be modified.
|
||||||
///
|
///
|
||||||
/// Returns None if the distance exceeds the limit.
|
/// Returns None if the distance exceeds the limit.
|
||||||
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
|
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
|
||||||
let n = a.chars().count();
|
let mut a = &a.chars().collect::<Vec<_>>()[..];
|
||||||
let m = b.chars().count();
|
let mut b = &b.chars().collect::<Vec<_>>()[..];
|
||||||
let min_dist = if n < m { m - n } else { n - m };
|
|
||||||
|
|
||||||
|
// Ensure that `b` is the shorter string, minimizing memory use.
|
||||||
|
if a.len() < b.len() {
|
||||||
|
mem::swap(&mut a, &mut b);
|
||||||
|
}
|
||||||
|
|
||||||
|
let min_dist = a.len() - b.len();
|
||||||
|
// If we know the limit will be exceeded, we can return early.
|
||||||
if min_dist > limit {
|
if min_dist > limit {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
if n == 0 || m == 0 {
|
|
||||||
return (min_dist <= limit).then_some(min_dist);
|
// Strip common prefix.
|
||||||
|
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first())
|
||||||
|
&& a_char == b_char
|
||||||
|
{
|
||||||
|
a = a_rest;
|
||||||
|
b = b_rest;
|
||||||
|
}
|
||||||
|
// Strip common suffix.
|
||||||
|
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last())
|
||||||
|
&& a_char == b_char
|
||||||
|
{
|
||||||
|
a = a_rest;
|
||||||
|
b = b_rest;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut dcol: Vec<_> = (0..=m).collect();
|
// If either string is empty, the distance is the length of the other.
|
||||||
|
// We know that `b` is the shorter string, so we don't need to check `a`.
|
||||||
|
if b.len() == 0 {
|
||||||
|
return Some(min_dist);
|
||||||
|
}
|
||||||
|
|
||||||
for (i, sc) in a.chars().enumerate() {
|
let mut prev_prev = vec![usize::MAX; b.len() + 1];
|
||||||
let mut current = i;
|
let mut prev = (0..=b.len()).collect::<Vec<_>>();
|
||||||
dcol[0] = current + 1;
|
let mut current = vec![0; b.len() + 1];
|
||||||
|
|
||||||
for (j, tc) in b.chars().enumerate() {
|
// row by row
|
||||||
let next = dcol[j + 1];
|
for i in 1..=a.len() {
|
||||||
if sc == tc {
|
current[0] = i;
|
||||||
dcol[j + 1] = current;
|
let a_idx = i - 1;
|
||||||
} else {
|
|
||||||
dcol[j + 1] = cmp::min(current, next);
|
// column by column
|
||||||
dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
|
for j in 1..=b.len() {
|
||||||
|
let b_idx = j - 1;
|
||||||
|
|
||||||
|
// There is no cost to substitute a character with itself.
|
||||||
|
let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
|
||||||
|
|
||||||
|
current[j] = cmp::min(
|
||||||
|
// deletion
|
||||||
|
prev[j] + 1,
|
||||||
|
cmp::min(
|
||||||
|
// insertion
|
||||||
|
current[j - 1] + 1,
|
||||||
|
// substitution
|
||||||
|
prev[j - 1] + substitution_cost,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
|
||||||
|
// transposition
|
||||||
|
current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
|
||||||
}
|
}
|
||||||
current = next;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rotate the buffers, reusing the memory.
|
||||||
|
[prev_prev, prev, current] = [prev, current, prev_prev];
|
||||||
}
|
}
|
||||||
|
|
||||||
(dcol[m] <= limit).then_some(dcol[m])
|
// `prev` because we already rotated the buffers.
|
||||||
|
let distance = prev[b.len()];
|
||||||
|
(distance <= limit).then_some(distance)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Provides a word similarity score between two words that accounts for substrings being more
|
/// Provides a word similarity score between two words that accounts for substrings being more
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#![feature(negative_impls)]
|
#![feature(negative_impls)]
|
||||||
#![feature(min_specialization)]
|
#![feature(min_specialization)]
|
||||||
#![feature(rustc_attrs)]
|
#![feature(rustc_attrs)]
|
||||||
|
#![feature(let_chains)]
|
||||||
#![deny(rustc::untranslatable_diagnostic)]
|
#![deny(rustc::untranslatable_diagnostic)]
|
||||||
#![deny(rustc::diagnostic_outside_of_impl)]
|
#![deny(rustc::diagnostic_outside_of_impl)]
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,9 @@ warning: unexpected `cfg` condition value
|
||||||
--> $DIR/invalid-cfg-value.rs:7:7
|
--> $DIR/invalid-cfg-value.rs:7:7
|
||||||
|
|
|
|
||||||
LL | #[cfg(feature = "sedre")]
|
LL | #[cfg(feature = "sedre")]
|
||||||
| ^^^^^^^^^^^^^^^^^
|
| ^^^^^^^^^^-------
|
||||||
|
| |
|
||||||
|
| help: did you mean: `"serde"`
|
||||||
|
|
|
|
||||||
= note: expected values for `feature` are: full, serde
|
= note: expected values for `feature` are: full, serde
|
||||||
= note: `#[warn(unexpected_cfgs)]` on by default
|
= note: `#[warn(unexpected_cfgs)]` on by default
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue