feat(code search): replace fuzzy search with union search for indexer (#6947)

Fuzzy searching for code is known to be problematic (see #5264) and, in my personal opinion, isn't very useful.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6947
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
Shiny Nematoda 2025-03-11 21:22:51 +00:00 committed by Gusted
parent cb46a036aa
commit 3816db68aa
10 changed files with 105 additions and 86 deletions

View file

@ -28,10 +28,10 @@ type GrepResult struct {
HighlightedRanges [][3]int
}
type grepMode int
type GrepMode int
const (
FixedGrepMode grepMode = iota
FixedGrepMode GrepMode = iota
FixedAnyGrepMode
RegExpGrepMode
)
@ -43,7 +43,7 @@ type GrepOptions struct {
MaxResultLimit int
MatchesPerFile int // >= git 2.38
ContextLineNumber int
Mode grepMode
Mode GrepMode
Filename string
}

View file

@ -40,10 +40,6 @@ import (
const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
// see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
maxFuzziness = 2
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@ -260,12 +256,14 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query
)
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
if opts.IsKeywordFuzzy {
phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator)
if opts.Mode == internal.CodeSearchModeUnion {
query := bleve.NewDisjunctionQuery()
for _, field := range strings.Fields(opts.Keyword) {
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, 0))
}
keywordQuery = query
} else {
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, 0)
}
if len(opts.RepoIDs) > 0 {
@ -325,13 +323,16 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
for i, hit := range result.Hits {
startIndex, endIndex := -1, -1
for _, locations := range hit.Locations["Content"] {
if startIndex != -1 && endIndex != -1 {
break
}
location := locations[0]
locationStart := int(location.Start)
locationEnd := int(location.End)
if startIndex < 0 || locationStart < startIndex {
startIndex = locationStart
}
if endIndex < 0 || locationEnd > endIndex {
if endIndex < 0 && locationEnd > endIndex {
endIndex = locationEnd
}
}

View file

@ -33,8 +33,8 @@ const (
esRepoIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrase = "phrase"
)
var _ internal.Indexer = &Indexer{}
@ -334,8 +334,8 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypePhrasePrefix
if opts.IsKeywordFuzzy {
searchType := esMultiMatchTypePhrase
if opts.Mode == internal.CodeSearchModeUnion {
searchType = esMultiMatchTypeBestFields
}

View file

@ -100,8 +100,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
Page: 1,
PageSize: 10,
},
Filename: kw.Filename,
IsKeywordFuzzy: true,
Filename: kw.Filename,
Mode: SearchModeUnion,
})
require.NoError(t, err)
assert.Len(t, kw.IDs, int(total))

View file

@ -20,13 +20,27 @@ type Indexer interface {
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
}
// CodeSearchMode identifies the keyword matching strategy requested from a
// code indexer.
type CodeSearchMode int

const (
	// CodeSearchModeExact is the zero value and therefore the default mode.
	CodeSearchModeExact CodeSearchMode = iota
	// CodeSearchModeUnion is the alternative, non-default mode.
	CodeSearchModeUnion
)

// String returns the textual name of the mode: "union" for
// CodeSearchModeUnion and "exact" for every other value, including values
// outside the declared constants.
func (mode CodeSearchMode) String() string {
	switch mode {
	case CodeSearchModeUnion:
		return "union"
	default:
		return "exact"
	}
}
type SearchOptions struct {
RepoIDs []int64
Keyword string
Language string
Filename string
IsKeywordFuzzy bool
Mode CodeSearchMode
db.Paginator
}

View file

@ -35,7 +35,14 @@ type SearchResultLanguages = internal.SearchResultLanguages
type SearchOptions = internal.SearchOptions
var CodeSearchOptions = [2]string{"exact", "fuzzy"}
var CodeSearchOptions = [2]string{"exact", "union"}
// SearchMode is an alias of internal.CodeSearchMode, so both names refer to
// the same type.
type SearchMode = internal.CodeSearchMode
// Search mode values re-exported from the internal package under shorter names.
const (
SearchModeExact = internal.CodeSearchModeExact
SearchModeUnion = internal.CodeSearchModeUnion
)
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
startIndex := selectionStartIndex
@ -206,7 +213,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
}
// PerformSearch perform a search on a repository
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
if opts == nil || len(opts.Keyword) == 0 {
return 0, nil, nil, nil