Rollup merge of #79000 - sivadeilra:user/ardavis/lev_distance, r=wesleywiser

Move lev_distance to rustc_ast, make non-generic

rustc_ast currently has a few dependencies on rustc_lexer. Ideally, an AST
would not have any dependency its lexer, for minimizing
design-time dependencies. Breaking this dependency would also have practical
benefits, since modifying rustc_lexer would not trigger a rebuild of rustc_ast.

This commit does not remove the rustc_ast --> rustc_lexer dependency,
but it does remove one of the sources of this dependency, which is the
code that handles fuzzy matching between symbol names for making suggestions
in diagnostics. Since that code depends only on Symbol, it is easy to move
it to rustc_span. It might even be best to move it to a separate crate,
since other tools such as Cargo use the same algorithm, and have simply
contain a duplicate of the code.

This changes the signature of find_best_match_for_name so that it is no
longer generic over its input. I checked the optimized binaries, and this
function was duplicated for nearly every call site, because most call sites
used short-lived iterator chains, generic over Map and such. But there's
no good reason for a function like this to be generic, since all it does
is immediately convert the generic input (the Iterator impl) to a concrete
Vec<Symbol>. This has all of the costs of generics (duplicated method bodies)
with no benefit.

Changing find_best_match_for_name to be non-generic removed about 10KB of
code from the optimized binary. I know it's a drop in the bucket, but we have
to start reducing binary size, and beginning to tame over-use of generics
is part of that.
This commit is contained in:
Jonas Schievink 2020-11-26 13:39:05 +01:00 committed by GitHub
commit 6fcd589025
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 96 additions and 84 deletions

View file

@ -34,7 +34,6 @@ macro_rules! unwrap_or {
pub mod util {
pub mod classify;
pub mod comments;
pub mod lev_distance;
pub mod literal;
pub mod parser;
}

View file

@ -1,109 +0,0 @@
// FIXME(Centril): Move to rustc_span?
use rustc_span::symbol::Symbol;
use std::cmp;
#[cfg(test)]
mod tests;
/// Finds the Levenshtein distance between two strings
pub fn lev_distance(a: &str, b: &str) -> usize {
// cases which don't require further computation
if a.is_empty() {
return b.chars().count();
} else if b.is_empty() {
return a.chars().count();
}
let mut dcol: Vec<_> = (0..=b.len()).collect();
let mut t_last = 0;
for (i, sc) in a.chars().enumerate() {
let mut current = i;
dcol[0] = current + 1;
for (j, tc) in b.chars().enumerate() {
let next = dcol[j + 1];
if sc == tc {
dcol[j + 1] = current;
} else {
dcol[j + 1] = cmp::min(current, next);
dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
}
current = next;
t_last = j;
}
}
dcol[t_last + 1]
}
/// Finds the best match for a given word in the given iterator
///
/// As a loose rule to avoid the obviously incorrect suggestions, it takes
/// an optional limit for the maximum allowable edit distance, which defaults
/// to one-third of the given word.
///
/// Besides Levenshtein, we use case insensitive comparison to improve accuracy on an edge case with
/// a lower(upper)case letters mismatch.
pub fn find_best_match_for_name<'a, T>(
iter_names: T,
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol>
where
T: Iterator<Item = &'a Symbol>,
{
let lookup = &lookup.as_str();
let max_dist = dist.unwrap_or_else(|| cmp::max(lookup.len(), 3) / 3);
let name_vec: Vec<&Symbol> = iter_names.collect();
let (case_insensitive_match, levenshtein_match) = name_vec
.iter()
.filter_map(|&name| {
let dist = lev_distance(lookup, &name.as_str());
if dist <= max_dist { Some((name, dist)) } else { None }
})
// Here we are collecting the next structure:
// (case_insensitive_match, (levenshtein_match, levenshtein_distance))
.fold((None, None), |result, (candidate, dist)| {
(
if candidate.as_str().to_uppercase() == lookup.to_uppercase() {
Some(candidate)
} else {
result.0
},
match result.1 {
None => Some((candidate, dist)),
Some((c, d)) => Some(if dist < d { (candidate, dist) } else { (c, d) }),
},
)
});
// Priority of matches:
// 1. Exact case insensitive match
// 2. Levenshtein distance match
// 3. Sorted word match
if let Some(candidate) = case_insensitive_match {
Some(*candidate)
} else if levenshtein_match.is_some() {
levenshtein_match.map(|(candidate, _)| *candidate)
} else {
find_match_by_sorted_words(name_vec, lookup)
}
}
fn find_match_by_sorted_words<'a>(iter_names: Vec<&'a Symbol>, lookup: &str) -> Option<Symbol> {
iter_names.iter().fold(None, |result, candidate| {
if sort_by_words(&candidate.as_str()) == sort_by_words(lookup) {
Some(**candidate)
} else {
result
}
})
}
fn sort_by_words(name: &str) -> String {
let mut split_words: Vec<&str> = name.split('_').collect();
// We are sorting primitive &strs and can use unstable sort here
split_words.sort_unstable();
split_words.join("_")
}

View file

@ -1,59 +0,0 @@
use super::*;
#[test]
fn test_lev_distance() {
use std::char::{from_u32, MAX};
// Test bytelength agnosticity
for c in (0..MAX as u32).filter_map(|i| from_u32(i)).map(|i| i.to_string()) {
assert_eq!(lev_distance(&c[..], &c[..]), 0);
}
let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
let c = "Mary häd ä little lämb\n\nLittle lämb\n";
assert_eq!(lev_distance(a, b), 1);
assert_eq!(lev_distance(b, a), 1);
assert_eq!(lev_distance(a, c), 2);
assert_eq!(lev_distance(c, a), 2);
assert_eq!(lev_distance(b, c), 1);
assert_eq!(lev_distance(c, b), 1);
}
#[test]
fn test_find_best_match_for_name() {
use rustc_span::with_default_session_globals;
with_default_session_globals(|| {
let input = vec![Symbol::intern("aaab"), Symbol::intern("aaabc")];
assert_eq!(
find_best_match_for_name(input.iter(), Symbol::intern("aaaa"), None),
Some(Symbol::intern("aaab"))
);
assert_eq!(
find_best_match_for_name(input.iter(), Symbol::intern("1111111111"), None),
None
);
let input = vec![Symbol::intern("aAAA")];
assert_eq!(
find_best_match_for_name(input.iter(), Symbol::intern("AAAA"), None),
Some(Symbol::intern("aAAA"))
);
let input = vec![Symbol::intern("AAAA")];
// Returns None because `lev_distance > max_dist / 3`
assert_eq!(find_best_match_for_name(input.iter(), Symbol::intern("aaaa"), None), None);
let input = vec![Symbol::intern("AAAA")];
assert_eq!(
find_best_match_for_name(input.iter(), Symbol::intern("aaaa"), Some(4)),
Some(Symbol::intern("AAAA"))
);
let input = vec![Symbol::intern("a_longer_variable_name")];
assert_eq!(
find_best_match_for_name(input.iter(), Symbol::intern("a_variable_longer_name"), None),
Some(Symbol::intern("a_longer_variable_name"))
);
})
}