Skip to content

Commit

Permalink
feat: add vertex vector search datapoints deletion with metadata filt…
Browse files Browse the repository at this point in the history
…ers support (#559)
  • Loading branch information
Pablito2020 authored Oct 29, 2024
1 parent 431409a commit 6aa6686
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
43 changes: 43 additions & 0 deletions libs/vertexai/langchain_google_vertexai/vectorstores/_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
to_data_points,
)

# Default value of the ``max_datapoints`` parameter of
# ``get_datapoints_by_filter``, i.e. the default upper bound on how many
# neighbors are requested when looking up datapoint ids by metadata filter.
MAX_DATA_POINTS = 10000


class Searcher(ABC):
"""Abstract implementation of a similarity searcher."""
Expand Down Expand Up @@ -60,6 +62,22 @@ def add_to_index(
"""
raise NotImplementedError()

@abstractmethod
def remove_datapoints(self, datapoint_ids: List[str], **kwargs: Any) -> None:
    """Remove the datapoints identified by ``datapoint_ids``.

    Abstract hook: this base version has no behavior of its own and
    concrete searchers are expected to override it.

    Args:
        datapoint_ids: Ids of the datapoints to remove.
        **kwargs: Extra implementation-specific options.

    Raises:
        NotImplementedError: Always, when the base version is invoked.
    """
    raise NotImplementedError

@abstractmethod
def get_datapoints_by_filter(
    self, metadata: dict, max_datapoints: int = MAX_DATA_POINTS
) -> List[str]:
    """Return the ids of the datapoints matching a metadata filter.

    Abstract hook: this base version has no behavior of its own and
    concrete searchers are expected to override it.

    Args:
        metadata: Metadata key/value pairs used as the filter.
        max_datapoints: Upper bound on the number of ids to return;
            defaults to ``MAX_DATA_POINTS``.

    Returns:
        The ids of the matching datapoints.

    Raises:
        NotImplementedError: Always, when the base version is invoked.
    """
    raise NotImplementedError

def _postprocess_response(
self, response: List[List[MatchNeighbor]]
) -> List[List[Tuple[str, float]]]:
Expand Down Expand Up @@ -105,6 +123,31 @@ def __init__(
self._staging_bucket = staging_bucket
self._stream_update = stream_update

def get_datapoints_by_filter(
    self, metadata: dict, max_datapoints: int = MAX_DATA_POINTS
) -> List[str]:
    """Look up the ids of the datapoints matching the metadata filters.

    The lookup is a ``find_neighbors`` call against the deployed index using
    an all-zero query vector, restricted by one ``Namespace`` per metadata
    entry (text-only filters).

    Args:
        metadata: Mapping of namespace name to the single token allowed for
            that namespace.
        max_datapoints: Number of neighbors requested (``k``), which caps
            how many ids can be returned.

    Returns:
        Ids from the first result list of ``find_neighbors``, or an empty
        list when ``find_neighbors`` returns nothing.
    """
    config = self._index.to_dict()["metadata"]["config"]
    # The query vector is a placeholder: its length is taken from the index
    # config, falling back to a single dimension when the key is absent.
    dimensions = int(config.get("dimensions", 1))
    zero_query = [0.0] * dimensions

    restricts = []
    for name, token in metadata.items():
        restricts.append(Namespace(name=name, allow_tokens=[token]))

    matches = self.find_neighbors(
        embeddings=[zero_query], k=max_datapoints, filter_=restricts
    )
    if not matches:
        return []
    # Only one query vector was sent, so only the first result list matters.
    return [datapoint_id for datapoint_id, _ in matches[0]]

def remove_datapoints(self, datapoint_ids: List[str], **kwargs: Any) -> None:
    """Remove the given datapoints from the underlying index.

    Delegates to ``self._index.remove_datapoints``.

    Args:
        datapoint_ids: Ids of the datapoints to remove.
        **kwargs: Accepted for interface compatibility; not used here.
    """
    index = self._index
    index.remove_datapoints(datapoint_ids=datapoint_ids)

def add_to_index(
self,
ids: List[str],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ def similarity_search_by_vector_with_score(
neighbors_list = self._searcher.find_neighbors(
embeddings=[embedding], k=k, filter_=filter, numeric_filter=numeric_filter
)
if not neighbors_list:
return []

keys = [key for key, _ in neighbors_list[0]]
distances = [distance for _, distance in neighbors_list[0]]
Expand All @@ -134,6 +136,36 @@ def similarity_search_by_vector_with_score(
message = f"Documents with ids: {missing_docs} not found in the storage"
raise ValueError(message)

def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Delete documents by vector id or by metadata filter.

    Exactly one selector must be supplied: either ``ids`` or a ``metadata``
    keyword argument. The selected datapoints are removed from the searcher
    first and then from the document storage.

    Args:
        ids (Optional[List[str]]): List of ids to delete.
        **kwargs (Any): If ``metadata={...}`` is passed, the ids to delete
            are resolved through the searcher's ``get_datapoints_by_filter``
            and ``ids`` must not be given.

    Returns:
        Optional[bool]: True if deletion is successful, False if the
            metadata filter matched no datapoints.

    Raises:
        ValueError: If neither ``ids`` nor ``metadata`` is provided, or if
            both are provided.
        RuntimeError: If an error occurs during the deletion process.
    """
    metadata = kwargs.get("metadata")
    # Empty lists/dicts count as "not provided"; exactly one selector is
    # required, so both-missing and both-present are rejected alike.
    if bool(ids) == bool(metadata):
        raise ValueError(
            "You should provide ids (as list of id's) or a metadata "
            "filter for deleting documents."
        )
    if metadata:
        ids = self._searcher.get_datapoints_by_filter(metadata=metadata)
        if not ids:
            # Nothing matched the filter, so there is nothing to delete.
            return False
    try:
        self._searcher.remove_datapoints(datapoint_ids=ids)  # type: ignore[arg-type]
        self._document_storage.mdelete(ids)  # type: ignore[arg-type]
        return True
    except Exception as e:
        raise RuntimeError(f"Error during deletion: {str(e)}") from e

def similarity_search(
self,
query: str,
Expand Down

0 comments on commit 6aa6686

Please sign in to comment.