feat(issue search): query string for boolean operators and phrase search (#6952)

closes #6909

related to forgejo/design#14

# Description

Adds the following boolean operators to issue search when an indexer is used (with minor caveats):

- `+term`: `term` MUST be present for any result
- `-term`: negation; exclude results that contain `term`
- `"this is a term"`: matches the exact phrase `this is a term`

In all cases, the special characters may be escaped by prefixing them with `\`.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6952
Reviewed-by: 0ko <0ko@noreply.codeberg.org>
Reviewed-by: Otto <otto@codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
Shiny Nematoda 2025-02-23 08:35:35 +00:00 committed by Earl Warren
parent eaa641c21e
commit cddf608cb9
19 changed files with 451 additions and 192 deletions

View file

@ -156,25 +156,32 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
var queries []query.Query
if options.Keyword != "" {
if options.IsFuzzyKeyword {
fuzziness := 1
if kl := len(options.Keyword); kl > 3 {
fuzziness = 2
} else if kl < 2 {
fuzziness = 0
}
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchQuery(options.Keyword, "title", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchQuery(options.Keyword, "content", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchQuery(options.Keyword, "comments", issueIndexerAnalyzer, fuzziness),
}...))
} else {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer, 0),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer, 0),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer, 0),
}...))
tokens, err := options.Tokens()
if err != nil {
return nil, err
}
q := bleve.NewBooleanQuery()
for _, token := range tokens {
fuzziness := 0
if token.Fuzzy {
// TODO: replace with "auto" after bleve update
fuzziness = min(len(token.Term)/4, 2)
}
innerQ := bleve.NewDisjunctionQuery(
inner_bleve.MatchPhraseQuery(token.Term, "title", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchPhraseQuery(token.Term, "content", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchPhraseQuery(token.Term, "comments", issueIndexerAnalyzer, fuzziness))
switch token.Kind {
case internal.BoolOptMust:
q.AddMust(innerQ)
case internal.BoolOptShould:
q.AddShould(innerQ)
case internal.BoolOptNot:
q.AddMustNot(innerQ)
}
}
queries = append(queries, q)
}
if len(options.RepoIDs) > 0 || options.AllPublic {

View file

@ -23,6 +23,10 @@ const (
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
// fuzziness options
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/common-options.html#fuzziness
esFuzzyAuto = "AUTO"
)
var _ internal.Indexer = &Indexer{}
@ -145,12 +149,30 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
query := elastic.NewBoolQuery()
if options.Keyword != "" {
searchType := esMultiMatchTypePhrasePrefix
if options.IsFuzzyKeyword {
searchType = esMultiMatchTypeBestFields
q := elastic.NewBoolQuery()
tokens, err := options.Tokens()
if err != nil {
return nil, err
}
for _, token := range tokens {
innerQ := elastic.NewMultiMatchQuery(token.Term, "title", "content", "comments")
if token.Fuzzy {
// If the term is not a phrase use fuzziness set to AUTO
innerQ = innerQ.Type(esMultiMatchTypeBestFields).Fuzziness(esFuzzyAuto)
} else {
innerQ = innerQ.Type(esMultiMatchTypePhrasePrefix)
}
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
switch token.Kind {
case internal.BoolOptMust:
q.Must(innerQ)
case internal.BoolOptShould:
q.Should(innerQ)
case internal.BoolOptNot:
q.MustNot(innerQ)
}
}
query.Must(q)
}
if len(options.RepoIDs) > 0 {

View file

@ -74,8 +74,6 @@ type SearchResult struct {
type SearchOptions struct {
Keyword string // keyword to search
IsFuzzyKeyword bool // if false the levenshtein distance is 0
RepoIDs []int64 // repository IDs which the issues belong to
AllPublic bool // if include all public repositories

View file

@ -0,0 +1,112 @@
// Copyright 2025 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"io"
"strings"
)
// BoolOpt is the boolean operator attached to a search token.
type BoolOpt int
const (
// BoolOptMust is assigned to a term that is prefixed with '+'.
BoolOptMust BoolOpt = iota
// BoolOptShould is assigned to a term that carries no operator prefix.
BoolOptShould
// BoolOptNot is assigned to a term that is prefixed with '-'.
BoolOptNot
)
// Token is a single term produced by tokenizing a search keyword.
type Token struct {
// Term is the text of the term after the operator prefix, quotes and
// escape sequences have been processed.
Term string
// Kind is the boolean operator that applies to the term.
Kind BoolOpt
// Fuzzy is true unless an unescaped double quote was encountered while
// scanning the term.
Fuzzy bool
}
// Tokenizer splits a query string into tokens, one per call to next.
type Tokenizer struct {
// in is the reader the runes of the query string are consumed from.
in *strings.Reader
}
// next scans and returns the next token of the query string.
//
// Leading spaces and tabs are skipped. An optional leading '+' or '-' sets the
// token kind to BoolOptMust or BoolOptNot; without a prefix the kind is
// BoolOptShould. An unescaped double quote turns the token into a non-fuzzy
// phrase: whitespace is kept as part of the term until the closing quote.
// Outside of a phrase the token ends at the first space or tab. A backslash
// escapes the following rune; the backslash itself is dropped in front of
// '+', '-', '\' and '"' and is kept in front of any other rune.
//
// io.EOF is returned when the input is exhausted before any rune of a term
// has been read. Reaching the end of the input while scanning a term simply
// ends the token and is not reported as an error.
func (t *Tokenizer) next() (tk Token, err error) {
	var (
		sb strings.Builder
		r  rune
	)

	tk.Kind = BoolOptShould
	tk.Fuzzy = true

	// skip all leading white space, leaving the first rune of the token in r;
	// exactly one rune is consumed per iteration so nothing is discarded
	for {
		if r, _, err = t.in.ReadRune(); err != nil {
			return tk, err
		}
		// tabs terminate a token below, so they count as separators here too
		if r != ' ' && r != '\t' {
			break
		}
	}

	// check for +/- op, increment to the next rune in both cases
	switch r {
	case '+':
		tk.Kind = BoolOptMust
		r, _, err = t.in.ReadRune()
	case '-':
		tk.Kind = BoolOptNot
		r, _, err = t.in.ReadRune()
	}
	if err != nil {
		return tk, err
	}

	// parse the string, escaping special characters
	for esc := false; err == nil; r, _, err = t.in.ReadRune() {
		if esc {
			// only the special characters lose their backslash
			if !strings.ContainsRune("+-\\\"", r) {
				sb.WriteRune('\\')
			}
			sb.WriteRune(r)
			esc = false
			continue
		}

		switch r {
		case '\\':
			esc = true
		case '"':
			// a second quote closes the phrase and ends the token
			if !tk.Fuzzy {
				goto nextEnd
			}
			tk.Fuzzy = false
		case ' ', '\t':
			// white space separates tokens unless it is part of a phrase
			if tk.Fuzzy {
				goto nextEnd
			}
			sb.WriteRune(r)
		default:
			sb.WriteRune(r)
		}
	}

nextEnd:
	tk.Term = sb.String()
	if err == io.EOF {
		err = nil
	} // do not consider EOF as an error at the end
	return tk, err
}
// Tokens splits o.Keyword into its search tokens.
//
// It returns the tokens in the order they appear in the keyword. An empty
// keyword yields no tokens and no error. Any error reported by the tokenizer
// other than io.EOF is returned to the caller together with a nil slice.
func (o *SearchOptions) Tokens() (tokens []Token, err error) {
	it := Tokenizer{in: strings.NewReader(o.Keyword)}

	for {
		// assign to the named result instead of shadowing it, so the error
		// that ended the loop is the one inspected below
		var token Token
		if token, err = it.next(); err != nil {
			break
		}
		tokens = append(tokens, token)
	}

	// io.EOF only signals that the whole keyword has been consumed
	if err != io.EOF {
		return nil, err
	}
	return tokens, nil
}

View file

@ -0,0 +1,171 @@
// Copyright 2025 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// testIssueQueryStringOpt is a single tokenizer test case.
type testIssueQueryStringOpt struct {
// Keyword is the raw query string that gets tokenized.
Keyword string
// Results are the tokens expected from tokenizing Keyword.
Results []Token
}
// testOpts pairs query strings with the tokens the tokenizer is expected to
// produce for them.
var testOpts = []testIssueQueryStringOpt{
	// plain terms
	{
		Keyword: "Hello",
		Results: []Token{
			{Term: "Hello", Fuzzy: true, Kind: BoolOptShould},
		},
	},
	{
		Keyword: "Hello World",
		Results: []Token{
			{Term: "Hello", Fuzzy: true, Kind: BoolOptShould},
			{Term: "World", Fuzzy: true, Kind: BoolOptShould},
		},
	},

	// +/- operators
	{
		Keyword: "+Hello +World",
		Results: []Token{
			{Term: "Hello", Fuzzy: true, Kind: BoolOptMust},
			{Term: "World", Fuzzy: true, Kind: BoolOptMust},
		},
	},
	{
		Keyword: "+Hello World",
		Results: []Token{
			{Term: "Hello", Fuzzy: true, Kind: BoolOptMust},
			{Term: "World", Fuzzy: true, Kind: BoolOptShould},
		},
	},
	{
		Keyword: "+Hello -World",
		Results: []Token{
			{Term: "Hello", Fuzzy: true, Kind: BoolOptMust},
			{Term: "World", Fuzzy: true, Kind: BoolOptNot},
		},
	},

	// quoted phrases
	{
		Keyword: "\"Hello World\"",
		Results: []Token{
			{Term: "Hello World", Fuzzy: false, Kind: BoolOptShould},
		},
	},
	{
		Keyword: "+\"Hello World\"",
		Results: []Token{
			{Term: "Hello World", Fuzzy: false, Kind: BoolOptMust},
		},
	},
	{
		Keyword: "-\"Hello World\"",
		Results: []Token{
			{Term: "Hello World", Fuzzy: false, Kind: BoolOptNot},
		},
	},
	{
		Keyword: "\"+Hello -World\"",
		Results: []Token{
			{Term: "+Hello -World", Fuzzy: false, Kind: BoolOptShould},
		},
	},

	// escaped special characters
	{
		Keyword: "\\+Hello", // \+Hello => +Hello
		Results: []Token{
			{Term: "+Hello", Fuzzy: true, Kind: BoolOptShould},
		},
	},
	{
		Keyword: "\\\\Hello", // \\Hello => \Hello
		Results: []Token{
			{Term: "\\Hello", Fuzzy: true, Kind: BoolOptShould},
		},
	},
	{
		Keyword: "\\\"Hello", // \"Hello => "Hello
		Results: []Token{
			{Term: "\"Hello", Fuzzy: true, Kind: BoolOptShould},
		},
	},
}
func TestIssueQueryString(t *testing.T) {
var opt SearchOptions
for _, res := range testOpts {
t.Run(opt.Keyword, func(t *testing.T) {
opt.Keyword = res.Keyword
tokens, err := opt.Tokens()
require.NoError(t, err)
assert.Equal(t, res.Results, tokens)
})
}
}

View file

@ -131,6 +131,20 @@ var cases = []*testIndexerCase{
ExpectedIDs: []int64{1002, 1001, 1000},
ExpectedTotal: 3,
},
{
Name: "Keyword Exclude",
ExtraData: []*internal.IndexerData{
{ID: 1000, Title: "hi hello world"},
{ID: 1001, Content: "hi hello world"},
{ID: 1002, Comments: []string{"hello", "hello world"}},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello world -hi",
SortBy: internal.SortByCreatedDesc,
},
ExpectedIDs: []int64{1002},
ExpectedTotal: 1,
},
{
Name: "Keyword Fuzzy",
ExtraData: []*internal.IndexerData{
@ -139,9 +153,8 @@ var cases = []*testIndexerCase{
{ID: 1002, Comments: []string{"hi", "hello world"}},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello world",
SortBy: internal.SortByCreatedDesc,
IsFuzzyKeyword: true,
Keyword: "hello world",
SortBy: internal.SortByCreatedDesc,
},
ExpectedIDs: []int64{1002, 1001, 1000},
ExpectedTotal: 3,

View file

@ -232,20 +232,36 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
limit = 1
}
keyword := options.Keyword
if !options.IsFuzzyKeyword {
// to make it non fuzzy ("typo tolerance" in meilisearch terms), we have to quote the keyword(s)
// https://www.meilisearch.com/docs/reference/api/search#phrase-search
keyword = doubleQuoteKeyword(keyword)
var keywords []string
if options.Keyword != "" {
tokens, err := options.Tokens()
if err != nil {
return nil, err
}
for _, token := range tokens {
if !token.Fuzzy {
// to make it a phrase search, we have to quote the keyword(s)
// https://www.meilisearch.com/docs/reference/api/search#phrase-search
token.Term = doubleQuoteKeyword(token.Term)
}
// internal.BoolOptShould (Default, requires no modifications)
// internal.BoolOptMust (Not supported by meilisearch)
if token.Kind == internal.BoolOptNot {
token.Term = "-" + token.Term
}
keywords = append(keywords, token.Term)
}
}
searchRes, err := b.inner.Client.Index(b.inner.VersionedIndexName()).Search(keyword, &meilisearch.SearchRequest{
Filter: query.Statement(),
Limit: int64(limit),
Offset: int64(skip),
Sort: sortBy,
MatchingStrategy: meilisearch.All,
})
searchRes, err := b.inner.Client.Index(b.inner.VersionedIndexName()).
Search(strings.Join(keywords, " "), &meilisearch.SearchRequest{
Filter: query.Statement(),
Limit: int64(limit),
Offset: int64(skip),
Sort: sortBy,
MatchingStrategy: meilisearch.All,
})
if err != nil {
return nil, err
}