feat(issue search): query string for boolean operators and phrase search (#6952)
Closes #6909. Related to forgejo/design#14.

# Description

Adds the following boolean operators to issue search when an indexer is used (with minor caveats):

- `+term`: `term` MUST be present in every result
- `-term`: negation; excludes results that contain `term`
- `"this is a term"`: matches the exact phrase `this is a term`

In all cases, the special characters may be escaped by prefixing them with `\`.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6952
Reviewed-by: 0ko <0ko@noreply.codeberg.org>
Reviewed-by: Otto <otto@codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
parent
eaa641c21e
commit
cddf608cb9
19 changed files with 451 additions and 192 deletions
|
@ -156,25 +156,32 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
|
|||
var queries []query.Query
|
||||
|
||||
if options.Keyword != "" {
|
||||
if options.IsFuzzyKeyword {
|
||||
fuzziness := 1
|
||||
if kl := len(options.Keyword); kl > 3 {
|
||||
fuzziness = 2
|
||||
} else if kl < 2 {
|
||||
fuzziness = 0
|
||||
}
|
||||
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
|
||||
inner_bleve.MatchQuery(options.Keyword, "title", issueIndexerAnalyzer, fuzziness),
|
||||
inner_bleve.MatchQuery(options.Keyword, "content", issueIndexerAnalyzer, fuzziness),
|
||||
inner_bleve.MatchQuery(options.Keyword, "comments", issueIndexerAnalyzer, fuzziness),
|
||||
}...))
|
||||
} else {
|
||||
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
|
||||
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer, 0),
|
||||
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer, 0),
|
||||
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer, 0),
|
||||
}...))
|
||||
tokens, err := options.Tokens()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
q := bleve.NewBooleanQuery()
|
||||
for _, token := range tokens {
|
||||
fuzziness := 0
|
||||
if token.Fuzzy {
|
||||
// TODO: replace with "auto" after bleve update
|
||||
fuzziness = min(len(token.Term)/4, 2)
|
||||
}
|
||||
innerQ := bleve.NewDisjunctionQuery(
|
||||
inner_bleve.MatchPhraseQuery(token.Term, "title", issueIndexerAnalyzer, fuzziness),
|
||||
inner_bleve.MatchPhraseQuery(token.Term, "content", issueIndexerAnalyzer, fuzziness),
|
||||
inner_bleve.MatchPhraseQuery(token.Term, "comments", issueIndexerAnalyzer, fuzziness))
|
||||
|
||||
switch token.Kind {
|
||||
case internal.BoolOptMust:
|
||||
q.AddMust(innerQ)
|
||||
case internal.BoolOptShould:
|
||||
q.AddShould(innerQ)
|
||||
case internal.BoolOptNot:
|
||||
q.AddMustNot(innerQ)
|
||||
}
|
||||
}
|
||||
queries = append(queries, q)
|
||||
}
|
||||
|
||||
if len(options.RepoIDs) > 0 || options.AllPublic {
|
||||
|
|
|
@ -23,6 +23,10 @@ const (
|
|||
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
|
||||
esMultiMatchTypeBestFields = "best_fields"
|
||||
esMultiMatchTypePhrasePrefix = "phrase_prefix"
|
||||
|
||||
// fuzziness options
|
||||
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/common-options.html#fuzziness
|
||||
esFuzzyAuto = "AUTO"
|
||||
)
|
||||
|
||||
var _ internal.Indexer = &Indexer{}
|
||||
|
@ -145,12 +149,30 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
|
|||
query := elastic.NewBoolQuery()
|
||||
|
||||
if options.Keyword != "" {
|
||||
searchType := esMultiMatchTypePhrasePrefix
|
||||
if options.IsFuzzyKeyword {
|
||||
searchType = esMultiMatchTypeBestFields
|
||||
q := elastic.NewBoolQuery()
|
||||
tokens, err := options.Tokens()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, token := range tokens {
|
||||
innerQ := elastic.NewMultiMatchQuery(token.Term, "title", "content", "comments")
|
||||
if token.Fuzzy {
|
||||
// If the term is not a phrase use fuzziness set to AUTO
|
||||
innerQ = innerQ.Type(esMultiMatchTypeBestFields).Fuzziness(esFuzzyAuto)
|
||||
} else {
|
||||
innerQ = innerQ.Type(esMultiMatchTypePhrasePrefix)
|
||||
}
|
||||
|
||||
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
|
||||
switch token.Kind {
|
||||
case internal.BoolOptMust:
|
||||
q.Must(innerQ)
|
||||
case internal.BoolOptShould:
|
||||
q.Should(innerQ)
|
||||
case internal.BoolOptNot:
|
||||
q.MustNot(innerQ)
|
||||
}
|
||||
}
|
||||
query.Must(q)
|
||||
}
|
||||
|
||||
if len(options.RepoIDs) > 0 {
|
||||
|
|
|
@ -74,8 +74,6 @@ type SearchResult struct {
|
|||
type SearchOptions struct {
|
||||
Keyword string // keyword to search
|
||||
|
||||
IsFuzzyKeyword bool // if false the levenshtein distance is 0
|
||||
|
||||
RepoIDs []int64 // repository IDs which the issues belong to
|
||||
AllPublic bool // if include all public repositories
|
||||
|
||||
|
|
112
modules/indexer/issues/internal/qstring.go
Normal file
112
modules/indexer/issues/internal/qstring.go
Normal file
|
@ -0,0 +1,112 @@
|
|||
// Copyright 2025 The Forgejo Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package internal
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// BoolOpt is the boolean operator attached to a search Token.
type BoolOpt int
|
||||
|
||||
const (
|
||||
// BoolOptMust is assigned to tokens that are prefixed with '+'.
BoolOptMust BoolOpt = iota
|
||||
// BoolOptShould is the default for tokens without a '+' or '-' prefix.
BoolOptShould
|
||||
// BoolOptNot is assigned to tokens that are prefixed with '-'.
BoolOptNot
|
||||
)
|
||||
|
||||
// Token is a single search term produced by Tokenizer.
type Token struct {
|
||||
// Term is the text to search for.
Term string
|
||||
// Kind is the boolean operator that applies to Term.
Kind BoolOpt
|
||||
// Fuzzy is true for plain words and false for double quoted phrases.
Fuzzy bool
|
||||
}
|
||||
|
||||
// Tokenizer splits a query string into Tokens, one per call to next.
type Tokenizer struct {
|
||||
// in is the reader over the query string being tokenized.
in *strings.Reader
|
||||
}
|
||||
|
||||
func (t *Tokenizer) next() (tk Token, err error) {
|
||||
var (
|
||||
sb strings.Builder
|
||||
r rune
|
||||
)
|
||||
tk.Kind = BoolOptShould
|
||||
tk.Fuzzy = true
|
||||
|
||||
// skip all leading white space
|
||||
for {
|
||||
if r, _, err = t.in.ReadRune(); err == nil && r == ' ' {
|
||||
//nolint:staticcheck,wastedassign // SA4006 the variable is used after the loop
|
||||
r, _, err = t.in.ReadRune()
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return tk, err
|
||||
}
|
||||
|
||||
// check for +/- op, increment to the next rune in both cases
|
||||
switch r {
|
||||
case '+':
|
||||
tk.Kind = BoolOptMust
|
||||
r, _, err = t.in.ReadRune()
|
||||
case '-':
|
||||
tk.Kind = BoolOptNot
|
||||
r, _, err = t.in.ReadRune()
|
||||
}
|
||||
if err != nil {
|
||||
return tk, err
|
||||
}
|
||||
|
||||
// parse the string, escaping special characters
|
||||
for esc := false; err == nil; r, _, err = t.in.ReadRune() {
|
||||
if esc {
|
||||
if !strings.ContainsRune("+-\\\"", r) {
|
||||
sb.WriteRune('\\')
|
||||
}
|
||||
sb.WriteRune(r)
|
||||
esc = false
|
||||
continue
|
||||
}
|
||||
switch r {
|
||||
case '\\':
|
||||
esc = true
|
||||
case '"':
|
||||
if !tk.Fuzzy {
|
||||
goto nextEnd
|
||||
}
|
||||
tk.Fuzzy = false
|
||||
case ' ', '\t':
|
||||
if tk.Fuzzy {
|
||||
goto nextEnd
|
||||
}
|
||||
sb.WriteRune(r)
|
||||
default:
|
||||
sb.WriteRune(r)
|
||||
}
|
||||
}
|
||||
nextEnd:
|
||||
|
||||
tk.Term = sb.String()
|
||||
if err == io.EOF {
|
||||
err = nil
|
||||
} // do not consider EOF as an error at the end
|
||||
return tk, err
|
||||
}
|
||||
|
||||
// Tokenize the keyword
|
||||
func (o *SearchOptions) Tokens() (tokens []Token, err error) {
|
||||
in := strings.NewReader(o.Keyword)
|
||||
it := Tokenizer{in: in}
|
||||
|
||||
for token, err := it.next(); err == nil; token, err = it.next() {
|
||||
tokens = append(tokens, token)
|
||||
}
|
||||
if err != nil && err != io.EOF {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tokens, nil
|
||||
}
|
171
modules/indexer/issues/internal/qstring_test.go
Normal file
171
modules/indexer/issues/internal/qstring_test.go
Normal file
|
@ -0,0 +1,171 @@
|
|||
// Copyright 2025 The Forgejo Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package internal
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// testIssueQueryStringOpt is a single tokenizer test case.
type testIssueQueryStringOpt struct {
|
||||
// Keyword is the query string handed to SearchOptions.Tokens.
Keyword string
|
||||
// Results is the token sequence expected for Keyword.
Results []Token
|
||||
}
|
||||
|
||||
var testOpts = []testIssueQueryStringOpt{
|
||||
{
|
||||
Keyword: "Hello",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "Hello World",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
{
|
||||
Term: "World",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "+Hello +World",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptMust,
|
||||
},
|
||||
{
|
||||
Term: "World",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptMust,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "+Hello World",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptMust,
|
||||
},
|
||||
{
|
||||
Term: "World",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "+Hello -World",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptMust,
|
||||
},
|
||||
{
|
||||
Term: "World",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptNot,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "\"Hello World\"",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello World",
|
||||
Fuzzy: false,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "+\"Hello World\"",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello World",
|
||||
Fuzzy: false,
|
||||
Kind: BoolOptMust,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "-\"Hello World\"",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "Hello World",
|
||||
Fuzzy: false,
|
||||
Kind: BoolOptNot,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "\"+Hello -World\"",
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "+Hello -World",
|
||||
Fuzzy: false,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "\\+Hello", // \+Hello => +Hello
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "+Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "\\\\Hello", // \\Hello => \Hello
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "\\Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Keyword: "\\\"Hello", // \"Hello => "Hello
|
||||
Results: []Token{
|
||||
{
|
||||
Term: "\"Hello",
|
||||
Fuzzy: true,
|
||||
Kind: BoolOptShould,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func TestIssueQueryString(t *testing.T) {
|
||||
var opt SearchOptions
|
||||
for _, res := range testOpts {
|
||||
t.Run(opt.Keyword, func(t *testing.T) {
|
||||
opt.Keyword = res.Keyword
|
||||
tokens, err := opt.Tokens()
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, res.Results, tokens)
|
||||
})
|
||||
}
|
||||
}
|
|
@ -131,6 +131,20 @@ var cases = []*testIndexerCase{
|
|||
ExpectedIDs: []int64{1002, 1001, 1000},
|
||||
ExpectedTotal: 3,
|
||||
},
|
||||
{
|
||||
Name: "Keyword Exclude",
|
||||
ExtraData: []*internal.IndexerData{
|
||||
{ID: 1000, Title: "hi hello world"},
|
||||
{ID: 1001, Content: "hi hello world"},
|
||||
{ID: 1002, Comments: []string{"hello", "hello world"}},
|
||||
},
|
||||
SearchOptions: &internal.SearchOptions{
|
||||
Keyword: "hello world -hi",
|
||||
SortBy: internal.SortByCreatedDesc,
|
||||
},
|
||||
ExpectedIDs: []int64{1002},
|
||||
ExpectedTotal: 1,
|
||||
},
|
||||
{
|
||||
Name: "Keyword Fuzzy",
|
||||
ExtraData: []*internal.IndexerData{
|
||||
|
@ -139,9 +153,8 @@ var cases = []*testIndexerCase{
|
|||
{ID: 1002, Comments: []string{"hi", "hello world"}},
|
||||
},
|
||||
SearchOptions: &internal.SearchOptions{
|
||||
Keyword: "hello world",
|
||||
SortBy: internal.SortByCreatedDesc,
|
||||
IsFuzzyKeyword: true,
|
||||
Keyword: "hello world",
|
||||
SortBy: internal.SortByCreatedDesc,
|
||||
},
|
||||
ExpectedIDs: []int64{1002, 1001, 1000},
|
||||
ExpectedTotal: 3,
|
||||
|
|
|
@ -232,20 +232,36 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
|
|||
limit = 1
|
||||
}
|
||||
|
||||
keyword := options.Keyword
|
||||
if !options.IsFuzzyKeyword {
|
||||
// to make it non fuzzy ("typo tolerance" in meilisearch terms), we have to quote the keyword(s)
|
||||
// https://www.meilisearch.com/docs/reference/api/search#phrase-search
|
||||
keyword = doubleQuoteKeyword(keyword)
|
||||
var keywords []string
|
||||
if options.Keyword != "" {
|
||||
tokens, err := options.Tokens()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, token := range tokens {
|
||||
if !token.Fuzzy {
|
||||
// to make it a phrase search, we have to quote the keyword(s)
|
||||
// https://www.meilisearch.com/docs/reference/api/search#phrase-search
|
||||
token.Term = doubleQuoteKeyword(token.Term)
|
||||
}
|
||||
|
||||
// internal.BoolOptShould (Default, requires no modifications)
|
||||
// internal.BoolOptMust (Not supported by meilisearch)
|
||||
if token.Kind == internal.BoolOptNot {
|
||||
token.Term = "-" + token.Term
|
||||
}
|
||||
keywords = append(keywords, token.Term)
|
||||
}
|
||||
}
|
||||
|
||||
searchRes, err := b.inner.Client.Index(b.inner.VersionedIndexName()).Search(keyword, &meilisearch.SearchRequest{
|
||||
Filter: query.Statement(),
|
||||
Limit: int64(limit),
|
||||
Offset: int64(skip),
|
||||
Sort: sortBy,
|
||||
MatchingStrategy: meilisearch.All,
|
||||
})
|
||||
searchRes, err := b.inner.Client.Index(b.inner.VersionedIndexName()).
|
||||
Search(strings.Join(keywords, " "), &meilisearch.SearchRequest{
|
||||
Filter: query.Statement(),
|
||||
Limit: int64(limit),
|
||||
Offset: int64(skip),
|
||||
Sort: sortBy,
|
||||
MatchingStrategy: meilisearch.All,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue