Improve the Match Query (#541)
hanahmily authored Sep 23, 2024
1 parent a1d882a commit b8012f2
Showing 23 changed files with 534 additions and 165 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -27,6 +27,8 @@ Release Notes.
- Add HTTP health check endpoint for the data node.
- Add slow query log for the distributed query and local query.
- Support applying the index rule to the tag belonging to the entity.
- Add search analyzer "url" which breaks text into tokens at any non-letter and non-digit character.
- Introduce "match_option" to the "match" query.

### Bugs

21 changes: 9 additions & 12 deletions api/proto/banyandb/database/v1/schema.proto
@@ -168,19 +168,16 @@ message IndexRule {
Type type = 3 [(validate.rules).enum.defined_only = true];
// updated_at indicates when the IndexRule is updated
google.protobuf.Timestamp updated_at = 4;
enum Analyzer {
ANALYZER_UNSPECIFIED = 0;
// Keyword analyzer is a “noop” analyzer which returns the entire input string as a single token.
ANALYZER_KEYWORD = 1;
// Standard analyzer provides grammar based tokenization
ANALYZER_STANDARD = 2;
// Simple analyzer breaks text into tokens at any non-letter character,
// such as numbers, spaces, hyphens and apostrophes, discards non-letter characters,
// and changes uppercase to lowercase.
ANALYZER_SIMPLE = 3;
}

// analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices.
Analyzer analyzer = 5;
// available analyzers are:
// - "standard" provides grammar based tokenization
// - "simple" breaks text into tokens at any non-letter character,
// such as numbers, spaces, hyphens and apostrophes, discards non-letter characters,
// and changes uppercase to lowercase.
// - "keyword" is a “noop” analyzer which returns the entire input string as a single token.
// - "url" breaks test into tokens at any non-letter and non-digit character.
string analyzer = 5;
// no_sort indicates whether the index is not for sorting.
bool no_sort = 6;
}
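
From the Go side, the schema change above simply swaps the removed `Analyzer` enum for a plain string. A minimal sketch, assuming the standard protoc-gen-go mapping for the generated types; the metadata values are hypothetical:

```go
package main

import (
	"fmt"

	commonv1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/common/v1"
	databasev1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/database/v1"
)

func main() {
	// Hedged sketch, not from this commit: an IndexRule that selects the new
	// "url" analyzer as a string. The metadata values are hypothetical.
	rule := &databasev1.IndexRule{
		Metadata: &commonv1.Metadata{Name: "name-index", Group: "default"},
		Tags:     []string{"name"},
		Type:     databasev1.IndexRule_TYPE_INVERTED,
		Analyzer: "url", // before this commit: an IndexRule_Analyzer enum value
	}
	fmt.Println(rule.Analyzer)
}
```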
10 changes: 10 additions & 0 deletions api/proto/banyandb/model/v1/query.proto
@@ -67,6 +67,16 @@ message Condition {
string name = 1;
BinaryOp op = 2;
TagValue value = 3;
message MatchOption {
string analyzer = 1;
enum Operator {
OPERATOR_UNSPECIFIED = 0;
OPERATOR_AND = 1;
OPERATOR_OR = 2;
}
Operator operator = 2;
}
MatchOption match_option = 4;
}

// tag_families are indexed.
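From a Go client, the new option hangs off `Condition`. A minimal sketch using the generated `modelv1` types, assuming the standard protoc-gen-go mapping:

```go
package main

import (
	"fmt"

	modelv1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/model/v1"
)

func main() {
	// Hedged sketch, not from this commit: a match condition that overrides
	// the analyzer bound to the index rule and requires every token to hit.
	cond := &modelv1.Condition{
		Name: "name",
		Op:   modelv1.Condition_BINARY_OP_MATCH,
		Value: &modelv1.TagValue{
			Value: &modelv1.TagValue_Str{Str: &modelv1.Str{Value: "service-1"}},
		},
		MatchOption: &modelv1.Condition_MatchOption{
			Analyzer: "url",
			Operator: modelv1.Condition_MatchOption_OPERATOR_AND,
		},
	}
	fmt.Println(cond.GetMatchOption().GetOperator())
}
```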
49 changes: 33 additions & 16 deletions docs/api-reference.md
@@ -44,6 +44,7 @@

- [banyandb/model/v1/query.proto](#banyandb_model_v1_query-proto)
- [Condition](#banyandb-model-v1-Condition)
- [Condition.MatchOption](#banyandb-model-v1-Condition-MatchOption)
- [Criteria](#banyandb-model-v1-Criteria)
- [LogicalExpression](#banyandb-model-v1-LogicalExpression)
- [QueryOrder](#banyandb-model-v1-QueryOrder)
@@ -54,6 +55,7 @@
- [TimeRange](#banyandb-model-v1-TimeRange)

- [Condition.BinaryOp](#banyandb-model-v1-Condition-BinaryOp)
- [Condition.MatchOption.Operator](#banyandb-model-v1-Condition-MatchOption-Operator)
- [LogicalExpression.LogicalOp](#banyandb-model-v1-LogicalExpression-LogicalOp)
- [Sort](#banyandb-model-v1-Sort)

@@ -73,7 +75,6 @@
- [CompressionMethod](#banyandb-database-v1-CompressionMethod)
- [EncodingMethod](#banyandb-database-v1-EncodingMethod)
- [FieldType](#banyandb-database-v1-FieldType)
- [IndexRule.Analyzer](#banyandb-database-v1-IndexRule-Analyzer)
- [IndexRule.Type](#banyandb-database-v1-IndexRule-Type)
- [TagType](#banyandb-database-v1-TagType)

@@ -745,6 +746,23 @@ while for 1:N BinaryOp, values can be an array with length >= 1.
| name | [string](#string) | | |
| op | [Condition.BinaryOp](#banyandb-model-v1-Condition-BinaryOp) | | |
| value | [TagValue](#banyandb-model-v1-TagValue) | | |
| match_option | [Condition.MatchOption](#banyandb-model-v1-Condition-MatchOption) | | |






<a name="banyandb-model-v1-Condition-MatchOption"></a>

### Condition.MatchOption



| Field | Type | Label | Description |
| ----- | ---- | ----- | ----------- |
| analyzer | [string](#string) | | |
| operator | [Condition.MatchOption.Operator](#banyandb-model-v1-Condition-MatchOption-Operator) | | |



@@ -914,6 +932,19 @@ Each item in a string array is seen as a token instead of a query expression.



<a name="banyandb-model-v1-Condition-MatchOption-Operator"></a>

### Condition.MatchOption.Operator


| Name | Number | Description |
| ---- | ------ | ----------- |
| OPERATOR_UNSPECIFIED | 0 | |
| OPERATOR_AND | 1 | |
| OPERATOR_OR | 2 | |



<a name="banyandb-model-v1-LogicalExpression-LogicalOp"></a>

### LogicalExpression.LogicalOp
@@ -1001,7 +1032,7 @@ IndexRule should bind to a subject through an IndexRuleBinding to generate prope
| tags | [string](#string) | repeated | tags are the combination that refers to an indexed object If the elements in tags are more than 1, the object will generate a multi-tag index Caveat: All tags in a multi-tag MUST have an identical IndexType |
| type | [IndexRule.Type](#banyandb-database-v1-IndexRule-Type) | | type is the IndexType of this IndexObject. |
| updated_at | [google.protobuf.Timestamp](#google-protobuf-Timestamp) | | updated_at indicates when the IndexRule is updated |
| analyzer | [IndexRule.Analyzer](#banyandb-database-v1-IndexRule-Analyzer) | | analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. |
| analyzer | [string](#string) | | analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. available analyzers are: - "standard" provides grammar based tokenization - "simple" breaks text into tokens at any non-letter character, such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, and changes uppercase to lowercase. - "keyword" is a “noop” analyzer which returns the entire input string as a single token. - "url" breaks text into tokens at any non-letter and non-digit character. |
| no_sort | [bool](#bool) | | no_sort indicates whether the index is not for sorting. |


@@ -1198,20 +1229,6 @@ TopNAggregation generates offline TopN statistics for a measure's TopN appro



<a name="banyandb-database-v1-IndexRule-Analyzer"></a>

### IndexRule.Analyzer


| Name | Number | Description |
| ---- | ------ | ----------- |
| ANALYZER_UNSPECIFIED | 0 | |
| ANALYZER_KEYWORD | 1 | Keyword analyzer is a “noop” analyzer which returns the entire input string as a single token. |
| ANALYZER_STANDARD | 2 | Standard analyzer provides grammar based tokenization |
| ANALYZER_SIMPLE | 3 | Simple analyzer breaks text into tokens at any non-letter character, such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, and changes uppercase to lowercase. |



<a name="banyandb-database-v1-IndexRule-Type"></a>

### IndexRule.Type
35 changes: 35 additions & 0 deletions docs/interacting/bydbctl/query/filter-operation.md
@@ -65,6 +65,41 @@ criteria:
value: "us"
```

You can set a `match_option` to control the behavior of the match operation. The following are the available options:

- `analyzer`: The analyzer to use for the match operation. If not set, the analyzer defined in the index rule is used. The available analyzers are listed in the [IndexRules](../schema/index-rule.md) documentation.
- `operator`: The operator to use for the match operation. The default value is `OPERATOR_OR`. Available options are `OPERATOR_OR` and `OPERATOR_AND`.

If you want to use a different analyzer and operator, you can set the `match_option` as follows:

```yaml
criteria:
condition:
name: "name"
op: "BINARY_OP_MATCH"
value:
str:
value: "service-1"
match_option:
analyzer: "url"
operator: "OPERATOR_AND"
```

Consider data with the following tags:

```json
{
"name": "service-1"
}
{
"name": "service-2"
}
```

The above query returns the data whose `name` tag contains both `service` and `1`, which matches only `service-1`.

If you set the `operator` to `OPERATOR_OR`, the query returns the data whose `name` tag contains either `service` or `1`, which matches both `service-1` and `service-2`.
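
Under the hood, the two operators reduce to a boolean fold over per-token hits. The sketch below is illustrative only, not BanyanDB code:

```go
// Illustrative only: how OPERATOR_AND and OPERATOR_OR combine per-token
// hits once the analyzer has split the query string into tokens.
func match(docTokens map[string]bool, queryTokens []string, and bool) bool {
	for _, tok := range queryTokens {
		if and && !docTokens[tok] {
			return false // AND: every query token must appear
		}
		if !and && docTokens[tok] {
			return true // OR: a single hit is enough
		}
	}
	return and // AND: no token was missing; OR: no token hit
}
```

With the `url` analyzer, `service-1` is split into the tokens `service` and `1`, so the AND fold holds only for `service-1`, while the OR fold also accepts `service-2`.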

## [LogicalExpression.LogicalOp](../../../api-reference.md#logicalexpressionlogicalop)
Logical operation is used to combine multiple conditions.

6 changes: 3 additions & 3 deletions docs/interacting/bydbctl/schema/index-rule.md
@@ -53,8 +53,8 @@ EOF

This YAML creates an index rule which uses the tag `trace_id` to generate a `TYPE_INVERTED` index.

The `analyzer` field is optional. If it is not set, the default value is `ANALYZER_UNSPECIFIED`.
We can set it to `ANALYZER_KEYWORD` to specify the analyzer. More analyzers can refer to the [API Reference](../../../api-reference.md#indexruleanalyzer).
The `analyzer` field is optional. If it is not set, the default value is an empty string.
We can set it to `url` to specify the analyzer. The other available analyzers are listed in the [API Reference](../../../api-reference.md#indexruleanalyzer).
```shell
bydbctl indexRule create -f - <<EOF
metadata:
@@ -63,7 +63,7 @@ metadata:
tags:
- trace_id
type: TYPE_INVERTED
analyzer: ANALYZER_KEYWORD
analyzer: url
EOF
```

17 changes: 15 additions & 2 deletions pkg/index/index.go
@@ -33,11 +33,24 @@ import (
"github.com/apache/skywalking-banyandb/pkg/timestamp"
)

const (
// AnalyzerUnspecified represents an unspecified analyzer.
AnalyzerUnspecified = ""
// AnalyzerKeyword is a “noop” analyzer which returns the entire input string as a single token.
AnalyzerKeyword = "keyword"
// AnalyzerSimple breaks text into tokens at any non-letter character.
AnalyzerSimple = "simple"
// AnalyzerStandard provides grammar based tokenization.
AnalyzerStandard = "standard"
// AnalyzerURL breaks text into tokens at any non-letter and non-digit character.
AnalyzerURL = "url"
)

// FieldKey is the key of field in a document.
type FieldKey struct {
Analyzer string
SeriesID common.SeriesID
IndexRuleID uint32
Analyzer databasev1.IndexRule_Analyzer
}

// Marshal encodes f to string.
@@ -168,7 +181,7 @@ type FieldIterable interface {
// Searcher allows searching a field either by its key or by its key and term.
type Searcher interface {
FieldIterable
Match(fieldKey FieldKey, match []string) (list posting.List, err error)
Match(fieldKey FieldKey, match []string, opts *modelv1.Condition_MatchOption) (list posting.List, err error)
MatchField(fieldKey FieldKey) (list posting.List, err error)
MatchTerms(field Field) (list posting.List, err error)
Range(fieldKey FieldKey, opts RangeOpts) (list posting.List, err error)
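For orientation, a caller now threads the option through the widened `Match`. A hedged sketch whose imports mirror those of `pkg/index/index.go`; `store` and the IDs are hypothetical:

```go
// Hedged sketch, not from this commit: invoking the new Match signature.
// store is any index.Searcher implementation; the IDs are made up.
func matchURL(store index.Searcher) (posting.List, error) {
	fk := index.FieldKey{
		SeriesID:    common.SeriesID(1),
		IndexRuleID: 10,
		Analyzer:    index.AnalyzerURL,
	}
	opts := &modelv1.Condition_MatchOption{
		Analyzer: index.AnalyzerURL, // overrides the analyzer bound to the rule
		Operator: modelv1.Condition_MatchOption_OPERATOR_AND,
	}
	return store.Match(fk, []string{"service-1"}, opts)
}
```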
56 changes: 56 additions & 0 deletions pkg/index/inverted/analyzer.go
@@ -0,0 +1,56 @@
// Licensed to Apache Software Foundation (ASF) under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Apache Software Foundation (ASF) licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package inverted

import (
"bytes"
"unicode"

"github.com/blugelabs/bluge/analysis"
"github.com/blugelabs/bluge/analysis/tokenizer"
)

func newURLAnalyzer() *analysis.Analyzer {
return &analysis.Analyzer{
Tokenizer: tokenizer.NewCharacterTokenizer(func(r rune) bool {
return unicode.IsLetter(r) || unicode.IsNumber(r)
}),
TokenFilters: []analysis.TokenFilter{
newAlphanumericFilter(),
},
}
}

type alphanumericFilter struct{}

func newAlphanumericFilter() *alphanumericFilter {
return &alphanumericFilter{}
}

func (f *alphanumericFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
termRunes := []rune{}
for _, r := range bytes.Runes(token.Term) {
if unicode.IsLetter(r) || unicode.IsNumber(r) {
termRunes = append(termRunes, r)
}
}
token.Term = analysis.BuildTermFromRunes(termRunes)
}
return input
}
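
A quick way to see what the new analyzer emits; a hedged sketch that assumes bluge's `(*analysis.Analyzer).Analyze([]byte)` method, which this diff does not show:

```go
package inverted

import (
	"fmt"
	"testing"
)

// Hypothetical check, not part of the commit: print the url analyzer's
// tokens for a sample URL. Assumes analysis.Analyzer exposes Analyze.
func TestURLAnalyzerTokens(t *testing.T) {
	a := newURLAnalyzer()
	for _, tok := range a.Analyze([]byte("http://example.com:8080/path-1")) {
		fmt.Println(string(tok.Term)) // http, example, com, 8080, path, 1
	}
}
```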