Improve the Match Query (#541)
hanahmily authored Sep 23, 2024
1 parent a1d882a commit b8012f2
Showing 23 changed files with 534 additions and 165 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -27,6 +27,8 @@ Release Notes.
- Add HTTP health check endpoint for the data node.
- Add slow query log for the distributed query and local query.
- Support applying the index rule to the tag belonging to the entity.
- Add search analyzer "url" which breaks text into tokens at any non-letter and non-digit character.
- Introduce "match_option" to the "match" query.

### Bugs

21 changes: 9 additions & 12 deletions api/proto/banyandb/database/v1/schema.proto
@@ -168,19 +168,16 @@ message IndexRule {
Type type = 3 [(validate.rules).enum.defined_only = true];
// updated_at indicates when the IndexRule is updated
google.protobuf.Timestamp updated_at = 4;
enum Analyzer {
ANALYZER_UNSPECIFIED = 0;
// Keyword analyzer is a “noop” analyzer which returns the entire input string as a single token.
ANALYZER_KEYWORD = 1;
// Standard analyzer provides grammar based tokenization
ANALYZER_STANDARD = 2;
// Simple analyzer breaks text into tokens at any non-letter character,
// such as numbers, spaces, hyphens and apostrophes, discards non-letter characters,
// and changes uppercase to lowercase.
ANALYZER_SIMPLE = 3;
}

// analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices.
Analyzer analyzer = 5;
// available analyzers are:
// - "standard" provides grammar based tokenization
// - "simple" breaks text into tokens at any non-letter character,
// such as numbers, spaces, hyphens and apostrophes, discards non-letter characters,
// and changes uppercase to lowercase.
// - "keyword" is a “noop” analyzer which returns the entire input string as a single token.
// - "url" breaks test into tokens at any non-letter and non-digit character.
string analyzer = 5;
// no_sort indicates whether the index is not for sorting.
bool no_sort = 6;
}
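
From the Go side, the schema change above simply swaps the removed `Analyzer` enum for a plain string. A minimal sketch, assuming the standard protoc-gen-go mapping for the generated types; the metadata values are hypothetical:

```go
package main

import (
	"fmt"

	commonv1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/common/v1"
	databasev1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/database/v1"
)

func main() {
	// Hedged sketch, not from this commit: an IndexRule that selects the new
	// "url" analyzer as a string. The metadata values are hypothetical.
	rule := &databasev1.IndexRule{
		Metadata: &commonv1.Metadata{Name: "name-index", Group: "default"},
		Tags:     []string{"name"},
		Type:     databasev1.IndexRule_TYPE_INVERTED,
		Analyzer: "url", // before this commit: an IndexRule_Analyzer enum value
	}
	fmt.Println(rule.Analyzer)
}
```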
10 changes: 10 additions & 0 deletions api/proto/banyandb/model/v1/query.proto
@@ -67,6 +67,16 @@ message Condition {
string name = 1;
BinaryOp op = 2;
TagValue value = 3;
message MatchOption {
string analyzer = 1;
enum Operator {
OPERATOR_UNSPECIFIED = 0;
OPERATOR_AND = 1;
OPERATOR_OR = 2;
}
Operator operator = 2;
}
MatchOption match_option = 4;
}

// tag_families are indexed.
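From a Go client, the new option hangs off `Condition`. A minimal sketch using the generated `modelv1` types, assuming the standard protoc-gen-go mapping:

```go
package main

import (
	"fmt"

	modelv1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/model/v1"
)

func main() {
	// Hedged sketch, not from this commit: a match condition that overrides
	// the analyzer bound to the index rule and requires every token to hit.
	cond := &modelv1.Condition{
		Name: "name",
		Op:   modelv1.Condition_BINARY_OP_MATCH,
		Value: &modelv1.TagValue{
			Value: &modelv1.TagValue_Str{Str: &modelv1.Str{Value: "service-1"}},
		},
		MatchOption: &modelv1.Condition_MatchOption{
			Analyzer: "url",
			Operator: modelv1.Condition_MatchOption_OPERATOR_AND,
		},
	}
	fmt.Println(cond.GetMatchOption().GetOperator())
}
```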
49 changes: 33 additions & 16 deletions docs/api-reference.md
@@ -44,6 +44,7 @@

- [banyandb/model/v1/query.proto](#banyandb_model_v1_query-proto)
- [Condition](#banyandb-model-v1-Condition)
- [Condition.MatchOption](#banyandb-model-v1-Condition-MatchOption)
- [Criteria](#banyandb-model-v1-Criteria)
- [LogicalExpression](#banyandb-model-v1-LogicalExpression)
- [QueryOrder](#banyandb-model-v1-QueryOrder)
@@ -54,6 +55,7 @@
- [TimeRange](#banyandb-model-v1-TimeRange)

- [Condition.BinaryOp](#banyandb-model-v1-Condition-BinaryOp)
- [Condition.MatchOption.Operator](#banyandb-model-v1-Condition-MatchOption-Operator)
- [LogicalExpression.LogicalOp](#banyandb-model-v1-LogicalExpression-LogicalOp)
- [Sort](#banyandb-model-v1-Sort)

@@ -73,7 +75,6 @@
- [CompressionMethod](#banyandb-database-v1-CompressionMethod)
- [EncodingMethod](#banyandb-database-v1-EncodingMethod)
- [FieldType](#banyandb-database-v1-FieldType)
- [IndexRule.Analyzer](#banyandb-database-v1-IndexRule-Analyzer)
- [IndexRule.Type](#banyandb-database-v1-IndexRule-Type)
- [TagType](#banyandb-database-v1-TagType)

@@ -745,6 +746,23 @@ while for 1:N BinaryOp, values can be an array with length >= 1.
| name | [string](#string) | | |
| op | [Condition.BinaryOp](#banyandb-model-v1-Condition-BinaryOp) | | |
| value | [TagValue](#banyandb-model-v1-TagValue) | | |
| match_option | [Condition.MatchOption](#banyandb-model-v1-Condition-MatchOption) | | |






<a name="banyandb-model-v1-Condition-MatchOption"></a>

### Condition.MatchOption



| Field | Type | Label | Description |
| ----- | ---- | ----- | ----------- |
| analyzer | [string](#string) | | |
| operator | [Condition.MatchOption.Operator](#banyandb-model-v1-Condition-MatchOption-Operator) | | |



@@ -914,6 +932,19 @@ Each item in a string array is seen as a token instead of a query expression.



<a name="banyandb-model-v1-Condition-MatchOption-Operator"></a>

### Condition.MatchOption.Operator


| Name | Number | Description |
| ---- | ------ | ----------- |
| OPERATOR_UNSPECIFIED | 0 | |
| OPERATOR_AND | 1 | |
| OPERATOR_OR | 2 | |



<a name="banyandb-model-v1-LogicalExpression-LogicalOp"></a>

### LogicalExpression.LogicalOp
@@ -1001,7 +1032,7 @@ IndexRule should bind to a subject through an IndexRuleBinding to generate prope
| tags | [string](#string) | repeated | tags are the combination that refers to an indexed object If the elements in tags are more than 1, the object will generate a multi-tag index Caveat: All tags in a multi-tag MUST have an identical IndexType |
| type | [IndexRule.Type](#banyandb-database-v1-IndexRule-Type) | | type is the IndexType of this IndexObject. |
| updated_at | [google.protobuf.Timestamp](#google-protobuf-Timestamp) | | updated_at indicates when the IndexRule is updated |
| analyzer | [IndexRule.Analyzer](#banyandb-database-v1-IndexRule-Analyzer) | | analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. |
| analyzer | [string](#string) | | analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. available analyzers are: - "standard" provides grammar based tokenization - "simple" breaks text into tokens at any non-letter character, such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, and changes uppercase to lowercase. - "keyword" is a “noop” analyzer which returns the entire input string as a single token. - "url" breaks text into tokens at any non-letter and non-digit character. |
| no_sort | [bool](#bool) | | no_sort indicates whether the index is not for sorting. |


@@ -1198,20 +1229,6 @@ TopNAggregation generates offline TopN statistics for a measure's TopN appro



<a name="banyandb-database-v1-IndexRule-Analyzer"></a>

### IndexRule.Analyzer


| Name | Number | Description |
| ---- | ------ | ----------- |
| ANALYZER_UNSPECIFIED | 0 | |
| ANALYZER_KEYWORD | 1 | Keyword analyzer is a “noop” analyzer which returns the entire input string as a single token. |
| ANALYZER_STANDARD | 2 | Standard analyzer provides grammar based tokenization |
| ANALYZER_SIMPLE | 3 | Simple analyzer breaks text into tokens at any non-letter character, such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, and changes uppercase to lowercase. |



<a name="banyandb-database-v1-IndexRule-Type"></a>

### IndexRule.Type
35 changes: 35 additions & 0 deletions docs/interacting/bydbctl/query/filter-operation.md
@@ -65,6 +65,41 @@ criteria:
value: "us"
```

You can set a `match_option` to control the behavior of the match operation. The following are the available options:

- `analyzer`: The analyzer to use for the match operation. If not set, the analyzer defined in the index rule is used. The available analyzers are listed in the [IndexRules](../schema/index-rule.md) documentation.
- `operator`: The operator to use for the match operation. The default value is `OPERATOR_OR`. Available options are `OPERATOR_OR` and `OPERATOR_AND`.

If you want to use a different analyzer and operator, you can set the `match_option` as follows:

```yaml
criteria:
condition:
name: "name"
op: "BINARY_OP_MATCH"
value:
str:
value: "service-1"
match_option:
analyzer: "url"
operator: "OPERATOR_AND"
```

Consider data with the following tags:

```json
{
"name": "service-1"
}
{
"name": "service-2"
}
```

The above query returns the data whose `name` tag contains both `service` and `1`, which matches only `service-1`.

If you set the `operator` to `OPERATOR_OR`, the query returns the data whose `name` tag contains either `service` or `1`, which matches both `service-1` and `service-2`.
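
Under the hood, the two operators reduce to a boolean fold over per-token hits. The sketch below is illustrative only, not BanyanDB code:

```go
// Illustrative only: how OPERATOR_AND and OPERATOR_OR combine per-token
// hits once the analyzer has split the query string into tokens.
func match(docTokens map[string]bool, queryTokens []string, and bool) bool {
	for _, tok := range queryTokens {
		if and && !docTokens[tok] {
			return false // AND: every query token must appear
		}
		if !and && docTokens[tok] {
			return true // OR: a single hit is enough
		}
	}
	return and // AND: no token was missing; OR: no token hit
}
```

With the `url` analyzer, `service-1` is split into the tokens `service` and `1`, so the AND fold holds only for `service-1`, while the OR fold also accepts `service-2`.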

## [LogicalExpression.LogicalOp](../../../api-reference.md#logicalexpressionlogicalop)
Logical operation is used to combine multiple conditions.

6 changes: 3 additions & 3 deletions docs/interacting/bydbctl/schema/index-rule.md
@@ -53,8 +53,8 @@ EOF

This YAML creates an index rule which uses the tag `trace_id` to generate a `TYPE_INVERTED` index.

The `analyzer` field is optional. If it is not set, the default value is `ANALYZER_UNSPECIFIED`.
We can set it to `ANALYZER_KEYWORD` to specify the analyzer. More analyzers can refer to the [API Reference](../../../api-reference.md#indexruleanalyzer).
The `analyzer` field is optional. If it is not set, the default value is an empty string.
We can set it to `url` to specify the analyzer. The other available analyzers are listed in the [API Reference](../../../api-reference.md#indexruleanalyzer).
```shell
bydbctl indexRule create -f - <<EOF
metadata:
@@ -63,7 +63,7 @@ metadata:
tags:
- trace_id
type: TYPE_INVERTED
analyzer: ANALYZER_KEYWORD
analyzer: url
EOF
```

17 changes: 15 additions & 2 deletions pkg/index/index.go
@@ -33,11 +33,24 @@ import (
"github.com/apache/skywalking-banyandb/pkg/timestamp"
)

const (
// AnalyzerUnspecified represents an unspecified analyzer.
AnalyzerUnspecified = ""
// AnalyzerKeyword is a “noop” analyzer which returns the entire input string as a single token.
AnalyzerKeyword = "keyword"
// AnalyzerSimple breaks text into tokens at any non-letter character.
AnalyzerSimple = "simple"
// AnalyzerStandard provides grammar based tokenization.
AnalyzerStandard = "standard"
// AnalyzerURL breaks text into tokens at any non-letter and non-digit character.
AnalyzerURL = "url"
)

// FieldKey is the key of field in a document.
type FieldKey struct {
Analyzer string
SeriesID common.SeriesID
IndexRuleID uint32
Analyzer databasev1.IndexRule_Analyzer
}

// Marshal encodes f to string.
@@ -168,7 +181,7 @@ type FieldIterable interface {
// Searcher allows searching a field either by its key or by its key and term.
type Searcher interface {
FieldIterable
Match(fieldKey FieldKey, match []string) (list posting.List, err error)
Match(fieldKey FieldKey, match []string, opts *modelv1.Condition_MatchOption) (list posting.List, err error)
MatchField(fieldKey FieldKey) (list posting.List, err error)
MatchTerms(field Field) (list posting.List, err error)
Range(fieldKey FieldKey, opts RangeOpts) (list posting.List, err error)
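For orientation, a caller now threads the option through the widened `Match`. A hedged sketch whose imports mirror those of `pkg/index/index.go`; `store` and the IDs are hypothetical:

```go
// Hedged sketch, not from this commit: invoking the new Match signature.
// store is any index.Searcher implementation; the IDs are made up.
func matchURL(store index.Searcher) (posting.List, error) {
	fk := index.FieldKey{
		SeriesID:    common.SeriesID(1),
		IndexRuleID: 10,
		Analyzer:    index.AnalyzerURL,
	}
	opts := &modelv1.Condition_MatchOption{
		Analyzer: index.AnalyzerURL, // overrides the analyzer bound to the rule
		Operator: modelv1.Condition_MatchOption_OPERATOR_AND,
	}
	return store.Match(fk, []string{"service-1"}, opts)
}
```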
56 changes: 56 additions & 0 deletions pkg/index/inverted/analyzer.go
@@ -0,0 +1,56 @@
// Licensed to Apache Software Foundation (ASF) under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Apache Software Foundation (ASF) licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package inverted

import (
"bytes"
"unicode"

"github.com/blugelabs/bluge/analysis"
"github.com/blugelabs/bluge/analysis/tokenizer"
)

func newURLAnalyzer() *analysis.Analyzer {
return &analysis.Analyzer{
Tokenizer: tokenizer.NewCharacterTokenizer(func(r rune) bool {
return unicode.IsLetter(r) || unicode.IsNumber(r)
}),
TokenFilters: []analysis.TokenFilter{
newAlphanumericFilter(),
},
}
}

type alphanumericFilter struct{}

func newAlphanumericFilter() *alphanumericFilter {
return &alphanumericFilter{}
}

func (f *alphanumericFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
termRunes := []rune{}
for _, r := range bytes.Runes(token.Term) {
if unicode.IsLetter(r) || unicode.IsNumber(r) {
termRunes = append(termRunes, r)
}
}
token.Term = analysis.BuildTermFromRunes(termRunes)
}
return input
}
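
A quick way to see what the new analyzer emits; a hedged sketch that assumes bluge's `(*analysis.Analyzer).Analyze([]byte)` method, which this diff does not show:

```go
package inverted

import (
	"fmt"
	"testing"
)

// Hypothetical check, not part of the commit: print the url analyzer's
// tokens for a sample URL. Assumes analysis.Analyzer exposes Analyze.
func TestURLAnalyzerTokens(t *testing.T) {
	a := newURLAnalyzer()
	for _, tok := range a.Analyze([]byte("http://example.com:8080/path-1")) {
		fmt.Println(string(tok.Term)) // http, example, com, 8080, path, 1
	}
}
```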