Skip to content

Commit ed884fa

Browse files
authored
[CLN] Rename $matches to $regex (#4506)
## Description of changes _Summarize the changes made by this PR._ - Improvements & Bug fixes - Renames `$matches` and `$not_matches` to `$regex` and `$not_regex` - New functionality - N/A ## Test plan _How are these changes tested?_ - [ ] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust ## Documentation Changes _Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent 01401ab commit ed884fa

File tree

12 files changed

+103
-99
lines changed

12 files changed

+103
-99
lines changed

chromadb/api/types.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -827,13 +827,13 @@ def validate_where_document(where_document: WhereDocument) -> None:
827827
if operator not in [
828828
"$contains",
829829
"$not_contains",
830-
"$matches",
831-
"$not_matches",
830+
"$regex",
831+
"$not_regex",
832832
"$and",
833833
"$or",
834834
]:
835835
raise ValueError(
836-
f"Expected where document operator to be one of $contains, $and, $or, got {operator}"
836+
f"Expected where document operator to be one of $contains, $not_contains, $regex, $not_regex, $and, $or, got {operator}"
837837
)
838838
if operator == "$and" or operator == "$or":
839839
if not isinstance(operand, list):
@@ -846,14 +846,14 @@ def validate_where_document(where_document: WhereDocument) -> None:
846846
)
847847
for where_document_expression in operand:
848848
validate_where_document(where_document_expression)
849-
# Value is a $contains operator
849+
# Value is $contains/$not_contains/$regex/$not_regex operator
850850
elif not isinstance(operand, str):
851851
raise ValueError(
852-
f"Expected where document operand value for operator $contains to be a str, got {operand}"
852+
f"Expected where document operand value for operator {operator} to be a str, got {operand}"
853853
)
854854
elif len(operand) == 0:
855855
raise ValueError(
856-
"Expected where document operand value for operator $contains to be a non-empty str"
856+
f"Expected where document operand value for operator {operator} to be a non-empty str"
857857
)
858858

859859

chromadb/base_types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@
3131
WhereDocumentOperator = Union[
3232
Literal["$contains"],
3333
Literal["$not_contains"],
34-
Literal["$matches"],
35-
Literal["$not_matches"],
34+
Literal["$regex"],
35+
Literal["$not_regex"],
3636
LogicalOperator,
3737
]
3838
WhereDocument = Dict[WhereDocumentOperator, Union[str, List["WhereDocument"]]]

idl/chromadb/proto/chroma.proto

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,8 @@ message DirectWhereDocument {
165165
enum WhereDocumentOperator {
166166
CONTAINS = 0;
167167
NOT_CONTAINS = 1;
168-
MATCHES = 2;
169-
NOT_MATCHES = 3;
168+
REGEX = 2;
169+
NOT_REGEX = 3;
170170
}
171171

172172
// A branch-node `WhereDocument` node has a list of children.

rust/index/src/fulltext/types.rs

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -428,25 +428,22 @@ impl<'me> FullTextIndexReader<'me> {
428428
}
429429

430430
#[async_trait::async_trait]
431-
impl<'me> NgramLiteralProvider<FullTextIndexError> for FullTextIndexReader<'me> {
431+
impl<'reader> NgramLiteralProvider<FullTextIndexError> for FullTextIndexReader<'reader> {
432432
fn maximum_branching_factor(&self) -> usize {
433433
6
434434
}
435435

436-
async fn lookup_ngram_range<'fts, NgramRange>(
437-
&'fts self,
436+
async fn lookup_ngram_range<'me, NgramRange>(
437+
&'me self,
438438
ngram_range: NgramRange,
439-
) -> Result<Vec<(&'fts str, u32, RoaringBitmap)>, FullTextIndexError>
439+
) -> Result<Vec<(&'me str, u32, &'me [u32])>, FullTextIndexError>
440440
where
441-
NgramRange: Clone + RangeBounds<&'fts str> + Send + Sync,
441+
NgramRange: Clone + RangeBounds<&'me str> + Send + Sync,
442442
{
443443
Ok(self
444444
.posting_lists_blockfile_reader
445445
.get_range(ngram_range, ..)
446-
.await?
447-
.into_iter()
448-
.map(|(ngram, doc, pos)| (ngram, doc, pos.iter().collect()))
449-
.collect())
446+
.await?)
450447
}
451448
}
452449

rust/segment/src/sqlite_metadata.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,8 +487,8 @@ impl IntoSqliteExpr for DocumentExpression {
487487
match self.operator {
488488
DocumentOperator::Contains => doc_contains,
489489
DocumentOperator::NotContains => doc_contains.not(),
490-
DocumentOperator::Matches => todo!("Implement Regex matching. The result must be a not-nullable boolean (use `<expr>.is(true)`)"),
491-
DocumentOperator::NotMatches => todo!("Implement negated Regex matching. This must be exact opposite of Regex matching (use `<expr>.not()`)"),
490+
DocumentOperator::Regex => todo!("Implement Regex matching. The result must be a not-nullable boolean (use `<expr>.is(true)`)"),
491+
DocumentOperator::NotRegex => todo!("Implement negated Regex matching. This must be exact opposite of Regex matching (use `<expr>.not()`)"),
492492
}
493493
}
494494
}

rust/segment/src/test.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,10 +477,10 @@ impl CheckRecord for DocumentExpression {
477477
DocumentOperator::NotContains => {
478478
!document.is_some_and(|doc| doc.contains(&self.pattern.replace("%", "")))
479479
}
480-
DocumentOperator::Matches => {
480+
DocumentOperator::Regex => {
481481
document.is_some_and(|doc| Regex::new(&self.pattern).unwrap().is_match(doc))
482482
}
483-
DocumentOperator::NotMatches => {
483+
DocumentOperator::NotRegex => {
484484
!document.is_some_and(|doc| Regex::new(&self.pattern).unwrap().is_match(doc))
485485
}
486486
}

rust/types/src/metadata.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -693,16 +693,16 @@ impl From<DocumentExpression> for chroma_proto::DirectWhereDocument {
693693
pub enum DocumentOperator {
694694
Contains,
695695
NotContains,
696-
Matches,
697-
NotMatches,
696+
Regex,
697+
NotRegex,
698698
}
699699
impl From<chroma_proto::WhereDocumentOperator> for DocumentOperator {
700700
fn from(value: chroma_proto::WhereDocumentOperator) -> Self {
701701
match value {
702702
chroma_proto::WhereDocumentOperator::Contains => Self::Contains,
703703
chroma_proto::WhereDocumentOperator::NotContains => Self::NotContains,
704-
chroma_proto::WhereDocumentOperator::Matches => Self::Matches,
705-
chroma_proto::WhereDocumentOperator::NotMatches => Self::NotMatches,
704+
chroma_proto::WhereDocumentOperator::Regex => Self::Regex,
705+
chroma_proto::WhereDocumentOperator::NotRegex => Self::NotRegex,
706706
}
707707
}
708708
}
@@ -712,8 +712,8 @@ impl From<DocumentOperator> for chroma_proto::WhereDocumentOperator {
712712
match value {
713713
DocumentOperator::Contains => Self::Contains,
714714
DocumentOperator::NotContains => Self::NotContains,
715-
DocumentOperator::Matches => Self::Matches,
716-
DocumentOperator::NotMatches => Self::NotMatches,
715+
DocumentOperator::Regex => Self::Regex,
716+
DocumentOperator::NotRegex => Self::NotRegex,
717717
}
718718
}
719719
}

rust/types/src/regex/hir.rs

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,6 @@ pub enum ChromaHir {
2424
Alternation(Vec<ChromaHir>),
2525
}
2626

27-
impl ChromaHir {
28-
pub fn contains_ngram_literal(&self, n: usize) -> bool {
29-
match self {
30-
ChromaHir::Empty | ChromaHir::Class(_) => false,
31-
ChromaHir::Literal(s) => s.len() >= n,
32-
ChromaHir::Repetition { min, max: _, sub } => *min > 0 && sub.contains_ngram_literal(n),
33-
ChromaHir::Concat(hirs) => hirs.iter().any(|hir| hir.contains_ngram_literal(n)),
34-
ChromaHir::Alternation(hirs) => hirs.iter().all(|hir| hir.contains_ngram_literal(n)),
35-
}
36-
}
37-
}
38-
3927
impl TryFrom<hir::Hir> for ChromaHir {
4028
type Error = ChromaRegexError;
4129

rust/types/src/regex/literal_expr.rs

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
use std::{collections::HashMap, ops::RangeBounds};
1+
use std::{
2+
collections::{HashMap, HashSet},
3+
ops::RangeBounds,
4+
};
25

36
use regex_syntax::hir::ClassUnicode;
47
use roaring::RoaringBitmap;
@@ -27,6 +30,22 @@ pub enum LiteralExpr {
2730
Alternation(Vec<LiteralExpr>),
2831
}
2932

33+
impl LiteralExpr {
34+
pub fn contains_ngram_literal(&self, n: usize, max_literal_width: usize) -> bool {
35+
match self {
36+
LiteralExpr::Literal(literals) => literals
37+
.split(|lit| lit.width() > max_literal_width)
38+
.any(|chunk| chunk.len() >= n),
39+
LiteralExpr::Concat(literal_exprs) => literal_exprs
40+
.iter()
41+
.any(|expr| expr.contains_ngram_literal(n, max_literal_width)),
42+
LiteralExpr::Alternation(literal_exprs) => literal_exprs
43+
.iter()
44+
.all(|expr| expr.contains_ngram_literal(n, max_literal_width)),
45+
}
46+
}
47+
}
48+
3049
impl From<ChromaHir> for LiteralExpr {
3150
fn from(value: ChromaHir) -> Self {
3251
match value {
@@ -35,10 +54,12 @@ impl From<ChromaHir> for LiteralExpr {
3554
Self::Literal(literal.chars().map(Literal::Char).collect())
3655
}
3756
ChromaHir::Class(class_unicode) => Self::Literal(vec![Literal::Class(class_unicode)]),
38-
ChromaHir::Repetition { min, max: _, sub } => {
57+
ChromaHir::Repetition { min, max, sub } => {
3958
let mut repeat = vec![*sub; min as usize];
40-
// Append a breakpoint Hir to prevent merge with literal on the right
41-
repeat.push(ChromaHir::Alternation(vec![ChromaHir::Empty]));
59+
if max.is_none() || max.is_some_and(|m| m > min) {
60+
// Append a breakpoint Hir to prevent merge with literal on the right
61+
repeat.push(ChromaHir::Alternation(vec![ChromaHir::Empty]));
62+
}
4263
ChromaHir::Concat(repeat).into()
4364
}
4465
ChromaHir::Concat(hirs) => {
@@ -75,7 +96,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
7596
async fn lookup_ngram_range<'me, NgramRange>(
7697
&'me self,
7798
ngram_range: NgramRange,
78-
) -> Result<Vec<(&'me str, u32, RoaringBitmap)>, E>
99+
) -> Result<Vec<(&'me str, u32, &'me [u32])>, E>
79100
where
80101
NgramRange: Clone + RangeBounds<&'me str> + Send + Sync;
81102

@@ -84,10 +105,10 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
84105
async fn match_literal_with_mask(
85106
&self,
86107
literals: &[Literal],
87-
mask: Option<&RoaringBitmap>,
88-
) -> Result<Option<RoaringBitmap>, E> {
108+
mask: Option<&HashSet<u32>>,
109+
) -> Result<HashSet<u32>, E> {
89110
if mask.is_some_and(|m| m.is_empty()) {
90-
return Ok(mask.cloned());
111+
return Ok(HashSet::new());
91112
}
92113

93114
let (initial_literals, remaining_literals) = literals.split_at(N);
@@ -115,7 +136,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
115136
});
116137

117138
// ngram suffix -> doc_id -> position
118-
let mut suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, RoaringBitmap>> = HashMap::new();
139+
let mut suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, HashSet<u32>>> = HashMap::new();
119140
for ngram in initial_ngrams {
120141
let ngram_string = ngram.iter().collect::<String>();
121142
let ngram_doc_pos = self
@@ -128,12 +149,13 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
128149

129150
let suffix = ngram[1..].to_vec();
130151
for (_, doc_id, pos) in ngram_doc_pos {
131-
if mask.map(|m| m.contains(doc_id)).unwrap_or(mask.is_none()) {
132-
*suffix_doc_pos
152+
if mask.is_none() || mask.is_some_and(|m| m.contains(&doc_id)) {
153+
suffix_doc_pos
133154
.entry(suffix.clone())
134155
.or_default()
135156
.entry(doc_id)
136-
.or_default() |= pos;
157+
.or_default()
158+
.extend(pos);
137159
}
138160
}
139161
}
@@ -142,7 +164,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
142164
if suffix_doc_pos.is_empty() {
143165
break;
144166
}
145-
let mut new_suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, RoaringBitmap>> =
167+
let mut new_suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, HashSet<u32>>> =
146168
HashMap::new();
147169
for (mut suffix, doc_pos) in suffix_doc_pos {
148170
let ngram_ranges = match literal {
@@ -170,27 +192,19 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
170192
.await?;
171193
for (ngram, doc_id, next_pos) in ngram_doc_pos {
172194
if let Some(pos) = doc_pos.get(&doc_id) {
173-
if ngram.chars().last().is_some_and(|c| match literal {
174-
Literal::Char(literal_char) => c == *literal_char,
175-
Literal::Class(class_unicode) => {
176-
class_unicode.iter().any(|r| r.start() <= c && c <= r.end())
177-
}
178-
}) {
179-
// SAFETY(Sicheng): The RoaringBitmap iterator should be sorted
180-
let valid_next_pos = RoaringBitmap::from_sorted_iter(
181-
pos.iter()
182-
.filter_map(|p| next_pos.contains(p + 1).then_some(p + 1)),
183-
)
184-
.expect("RoaringBitmap iterator should be sorted");
185-
186-
if !valid_next_pos.is_empty() {
187-
let new_suffix = ngram.chars().skip(1).collect();
188-
*new_suffix_doc_pos
189-
.entry(new_suffix)
190-
.or_default()
191-
.entry(doc_id)
192-
.or_default() |= valid_next_pos;
193-
}
195+
let next_pos_set: HashSet<&u32> = HashSet::from_iter(next_pos);
196+
let mut valid_next_pos = pos
197+
.iter()
198+
.filter_map(|p| next_pos_set.contains(&(p + 1)).then_some(p + 1))
199+
.peekable();
200+
if valid_next_pos.peek().is_some() {
201+
let new_suffix = ngram.chars().skip(1).collect();
202+
new_suffix_doc_pos
203+
.entry(new_suffix)
204+
.or_default()
205+
.entry(doc_id)
206+
.or_default()
207+
.extend(valid_next_pos);
194208
}
195209
}
196210
}
@@ -203,16 +217,16 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
203217
.into_values()
204218
.flat_map(|doc_pos| doc_pos.into_keys())
205219
.collect();
206-
Ok(Some(result))
220+
Ok(result)
207221
}
208222

209223
// Return the documents matching the literal expression. The search space is restricted to the documents in the mask if specified
210224
// If all documents could match the literal expression, Ok(None) is returned
211225
async fn match_literal_expression_with_mask(
212226
&self,
213227
literal_expression: &LiteralExpr,
214-
mask: Option<&RoaringBitmap>,
215-
) -> Result<Option<RoaringBitmap>, E> {
228+
mask: Option<&HashSet<u32>>,
229+
) -> Result<Option<HashSet<u32>>, E> {
216230
match literal_expression {
217231
LiteralExpr::Literal(literals) => {
218232
let mut result = mask.cloned();
@@ -221,7 +235,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
221235
break;
222236
}
223237
if query.len() >= N {
224-
result = self.match_literal_with_mask(query, result.as_ref()).await?;
238+
result = Some(self.match_literal_with_mask(query, result.as_ref()).await?);
225239
}
226240
}
227241
Ok(result)
@@ -239,17 +253,17 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
239253
Ok(result)
240254
}
241255
LiteralExpr::Alternation(literal_exprs) => {
242-
let mut result = RoaringBitmap::new();
256+
let mut result = Vec::new();
243257
for expr in literal_exprs {
244258
if let Some(matching_docs) =
245259
self.match_literal_expression_with_mask(expr, mask).await?
246260
{
247-
result |= matching_docs;
261+
result.extend(matching_docs);
248262
} else {
249263
return Ok(mask.cloned());
250264
}
251265
}
252-
Ok(Some(result))
266+
Ok(Some(HashSet::from_iter(result)))
253267
}
254268
}
255269
}
@@ -262,6 +276,7 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
262276
) -> Result<Option<RoaringBitmap>, E> {
263277
self.match_literal_expression_with_mask(literal_expression, None)
264278
.await
279+
.map(|res| res.map(RoaringBitmap::from_iter))
265280
}
266281

267282
fn can_match_exactly(&self, literal_expression: &LiteralExpr) -> bool {

rust/types/src/where_parsing.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::regex::literal_expr::LiteralExpr;
12
use crate::regex::{ChromaRegex, ChromaRegexError};
23
use crate::{CompositeExpression, DocumentOperator, MetadataExpression, PrimitiveOperator, Where};
34
use chroma_error::{ChromaError, ErrorCodes};
@@ -140,16 +141,16 @@ pub fn parse_where_document(json_payload: &Value) -> Result<Where, WhereValidati
140141
let operator_type = match key.as_str() {
141142
"$contains" => DocumentOperator::Contains,
142143
"$not_contains" => DocumentOperator::NotContains,
143-
"$matches" => DocumentOperator::Matches,
144-
"$not_matches" => DocumentOperator::NotMatches,
144+
"$regex" => DocumentOperator::Regex,
145+
"$not_regex" => DocumentOperator::NotRegex,
145146
_ => return Err(WhereValidationError::WhereDocumentClause),
146147
};
147148
if matches!(
148149
operator_type,
149-
DocumentOperator::Matches | DocumentOperator::NotMatches
150+
DocumentOperator::Regex | DocumentOperator::NotRegex
150151
) {
151152
let regex = ChromaRegex::try_from(value_str.to_string())?;
152-
if !regex.hir().contains_ngram_literal(3) {
153+
if !LiteralExpr::from(regex.hir().clone()).contains_ngram_literal(3, 6) {
153154
return Err(ChromaRegexError::PermissivePattern.into());
154155
}
155156
}

0 commit comments

Comments
 (0)