feat(code search): replace fuzzy search with union search for indexer (#6947)
Fuzzy searching for code is known to be problematic (see #5264) and, in my personal opinion, isn't very useful. This commit replaces the indexer's fuzzy mode with a union mode, which matches files containing any of the whitespace-separated keywords. Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6947 Reviewed-by: Gusted <gusted@noreply.codeberg.org> Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com> Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
parent
cb46a036aa
commit
3816db68aa
10 changed files with 105 additions and 86 deletions
|
@ -28,10 +28,10 @@ type GrepResult struct {
|
|||
HighlightedRanges [][3]int
|
||||
}
|
||||
|
||||
type grepMode int
|
||||
type GrepMode int
|
||||
|
||||
const (
|
||||
FixedGrepMode grepMode = iota
|
||||
FixedGrepMode GrepMode = iota
|
||||
FixedAnyGrepMode
|
||||
RegExpGrepMode
|
||||
)
|
||||
|
@ -43,7 +43,7 @@ type GrepOptions struct {
|
|||
MaxResultLimit int
|
||||
MatchesPerFile int // >= git 2.38
|
||||
ContextLineNumber int
|
||||
Mode grepMode
|
||||
Mode GrepMode
|
||||
Filename string
|
||||
}
|
||||
|
||||
|
|
|
@ -40,10 +40,6 @@ import (
|
|||
const (
|
||||
unicodeNormalizeName = "unicodeNormalize"
|
||||
maxBatchSize = 16
|
||||
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
|
||||
fuzzyDenominator = 4
|
||||
// see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
|
||||
maxFuzziness = 2
|
||||
)
|
||||
|
||||
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
||||
|
@ -260,12 +256,14 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
|
|||
keywordQuery query.Query
|
||||
)
|
||||
|
||||
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
|
||||
phraseQuery.FieldVal = "Content"
|
||||
phraseQuery.Analyzer = repoIndexerAnalyzer
|
||||
keywordQuery = phraseQuery
|
||||
if opts.IsKeywordFuzzy {
|
||||
phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator)
|
||||
if opts.Mode == internal.CodeSearchModeUnion {
|
||||
query := bleve.NewDisjunctionQuery()
|
||||
for _, field := range strings.Fields(opts.Keyword) {
|
||||
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, 0))
|
||||
}
|
||||
keywordQuery = query
|
||||
} else {
|
||||
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, 0)
|
||||
}
|
||||
|
||||
if len(opts.RepoIDs) > 0 {
|
||||
|
@ -325,13 +323,16 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
|
|||
for i, hit := range result.Hits {
|
||||
startIndex, endIndex := -1, -1
|
||||
for _, locations := range hit.Locations["Content"] {
|
||||
if startIndex != -1 && endIndex != -1 {
|
||||
break
|
||||
}
|
||||
location := locations[0]
|
||||
locationStart := int(location.Start)
|
||||
locationEnd := int(location.End)
|
||||
if startIndex < 0 || locationStart < startIndex {
|
||||
startIndex = locationStart
|
||||
}
|
||||
if endIndex < 0 || locationEnd > endIndex {
|
||||
if endIndex < 0 && locationEnd > endIndex {
|
||||
endIndex = locationEnd
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,8 +33,8 @@ const (
|
|||
esRepoIndexerLatestVersion = 2
|
||||
// multi-match-types, currently only 2 types are used
|
||||
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
|
||||
esMultiMatchTypeBestFields = "best_fields"
|
||||
esMultiMatchTypePhrasePrefix = "phrase_prefix"
|
||||
esMultiMatchTypeBestFields = "best_fields"
|
||||
esMultiMatchTypePhrase = "phrase"
|
||||
)
|
||||
|
||||
var _ internal.Indexer = &Indexer{}
|
||||
|
@ -334,8 +334,8 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
|
|||
|
||||
// Search searches for codes and language stats by given conditions.
|
||||
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
||||
searchType := esMultiMatchTypePhrasePrefix
|
||||
if opts.IsKeywordFuzzy {
|
||||
searchType := esMultiMatchTypePhrase
|
||||
if opts.Mode == internal.CodeSearchModeUnion {
|
||||
searchType = esMultiMatchTypeBestFields
|
||||
}
|
||||
|
||||
|
|
|
@ -100,8 +100,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
|||
Page: 1,
|
||||
PageSize: 10,
|
||||
},
|
||||
Filename: kw.Filename,
|
||||
IsKeywordFuzzy: true,
|
||||
Filename: kw.Filename,
|
||||
Mode: SearchModeUnion,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, kw.IDs, int(total))
|
||||
|
|
|
@ -20,13 +20,27 @@ type Indexer interface {
|
|||
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
|
||||
}
|
||||
|
||||
// CodeSearchMode selects how an indexer matches the search keyword.
// The bleve indexer in this change treats union mode as "any
// whitespace-separated term of the keyword" and every other mode as
// "the whole keyword as one phrase".
type CodeSearchMode int
|
||||
|
||||
const (
|
||||
// CodeSearchModeExact is the zero value, so it is the mode used when
// SearchOptions.Mode is left unset.
CodeSearchModeExact CodeSearchMode = iota
|
||||
// CodeSearchModeUnion requests a match on any of the keyword's terms.
CodeSearchModeUnion
|
||||
)
|
||||
|
||||
// String returns "union" for CodeSearchModeUnion and "exact" for every
// other value, including values outside the declared constants.
func (mode CodeSearchMode) String() string {
|
||||
if mode == CodeSearchModeUnion {
|
||||
return "union"
|
||||
|
||||
}
|
||||
return "exact"
|
||||
}
|
||||
|
||||
type SearchOptions struct {
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
Language string
|
||||
Filename string
|
||||
|
||||
IsKeywordFuzzy bool
|
||||
Mode CodeSearchMode
|
||||
|
||||
db.Paginator
|
||||
}
|
||||
|
|
|
@ -35,7 +35,14 @@ type SearchResultLanguages = internal.SearchResultLanguages
|
|||
|
||||
type SearchOptions = internal.SearchOptions
|
||||
|
||||
var CodeSearchOptions = [2]string{"exact", "fuzzy"}
|
||||
var CodeSearchOptions = [2]string{"exact", "union"}
|
||||
|
||||
// SearchMode is an alias for internal.CodeSearchMode, so callers of this
// package can name the mode without importing the internal package.
type SearchMode = internal.CodeSearchMode
|
||||
|
||||
// Aliases for the internal search-mode constants, exposed under this
// package's names.
const (
|
||||
SearchModeExact = internal.CodeSearchModeExact
|
||||
SearchModeUnion = internal.CodeSearchModeUnion
|
||||
)
|
||||
|
||||
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
|
||||
startIndex := selectionStartIndex
|
||||
|
@ -206,7 +213,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
|
|||
}
|
||||
|
||||
// PerformSearch perform a search on a repository
|
||||
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
|
||||
func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
|
||||
if opts == nil || len(opts.Keyword) == 0 {
|
||||
return 0, nil, nil, nil
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue