From 66fe26bba27128d363197284aa1c5b5d7662fdc8 Mon Sep 17 00:00:00 2001 From: Aayush Kataria Date: Tue, 24 Sep 2024 08:17:21 -0700 Subject: [PATCH 1/3] Azure CosmosDB Vector Search integration --- .../retrievers/mongodb_vcore/__init__.py | 0 .../components/retrievers/nosql/__init__.py | 0 .../document_stores/mongodb_vcore/__init__.py | 0 .../mongodb_vcore/document_store.py | 345 ++++++++++++++++++ .../document_stores/mongodb_vcore/filters.py | 152 ++++++++ .../document_stores/nosql/__init__.py | 0 .../document_stores/nosql/document_store.py | 195 ++++++++++ 7 files changed, 692 insertions(+) create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/nosql/__init__.py create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/filters.py create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/__init__.py create mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py b/integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/nosql/__init__.py b/integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/nosql/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py new file mode 100644 index 000000000..48b16b8d3 --- /dev/null +++ b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py @@ -0,0 +1,345 @@ +import logging +import re +from typing import Any, Dict, List, Optional, Union + +from haystack.dataclasses.document import Document +from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret +from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne +from pymongo.collection import Collection +from pymongo.driver_info import DriverInfo +from pymongo.errors import BulkWriteError + +from haystack_integrations.document_stores.mongodb_vcore.filters import _normalize_filters + +logger = logging.getLogger(__name__) + + +class AzureCosmosDBMongoVCoreDocumentStore: + """ + AzureCosmosDBMongoVCoreDocumentStore is a DocumentStore implementation that uses + [Azure CosmosDB Mongo vCore](https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search#filtered-vector-search-preview) + service. + + To connect to Azure CosmosDB Mongo vCore, you need to provide a connection string in the format: + `"mongodb+srv://{mongo_vcore_username}:{mongo_vcore_password}@{mongo_vcore_host}/?{mongo_vcore_params_string}"`. + + This connection string can be obtained from the Azure portal cosmos db account keys tab. The connection string + can be provided as an environment variable `AZURE_COSMOS_MONGO_CONNECTION_STRING` or directly as a parameter to the + `AzureCosmosDBMongoVCoreDocumentStore` constructor. + + After providing the connection string, you'll need to specify the `database_name` and `collection_name` to use. + AzureCosmosDBMongoVCoreDocumentStore has an implementation for creating a collection if one is not created already. + + You need to provide a `vector_search_index_name` for the name of the vector which would be created when the colleciton + is being created. + + The last parameter users need to provide is a `vector_search_kwargs` - used for configs for vector search in mongo vCore. + { + "vector_dimensions": 1536, + "num_lists": 1, + "similarity": "COS", + "kind": "vector-hnsw", + "m": 2, + "ef_construction": 64, + "ef_search": 40 + } + """ + def __init__( + self, + *, + mongo_connection_string: Secret = Secret.from_env_var("AZURE_COSMOS_MONGO_CONNECTION_STRING"), + database_name: str, + collection_name: str, + vector_search_index_name: str, + vector_search_kwargs: dict[str, Any], + ): + """ + Creates a new MongoDBAtlasDocumentStore instance. + + :param mongo_connection_string: MongoDB Atlas connection string in the format: + `"mongodb+srv://{mongo_vcore_username}:{mongo_vcore_password}@{mongo_vcore_host}/?{mongo_vcore_params_string}"`. + :param database_name: Name of the database to use. + :param collection_name: Name of the collection to use. To use this document store for embedding retrieval, + this collection needs to have a vector search index set up on the `embedding` field. + :param vector_search_index_name: The name of the vector search index to use for vector search operations. + :param vector_search_kwargs: Configs for vector search in mongo vCore + + :raises ValueError: If the collection name contains invalid characters. + """ + if collection_name and not bool(re.match(r"^[a-zA-Z0-9\-_]+$", collection_name)): + msg = f'Invalid collection name: "{collection_name}". It can only contain letters, numbers, -, or _.' + raise ValueError(msg) + + self.mongo_connection_string = mongo_connection_string + self.database_name = database_name + self.collection_name = collection_name + self.vector_search_index_name = vector_search_index_name + self.vector_search_kwargs = vector_search_kwargs + self._mongo_client = Optional[MongoClient] = None + self._collection = Optional[Collection] = None + + @property + def mongo_client(self) -> MongoClient: + if self._mongo_client is None: + self._mongo_client = MongoClient( + self.mongo_connection_string.resolve_value(), + appname="", + driver=DriverInfo(name="AzureCosmosDBMongoVCoreDocumentStore") + ) + return self._mongo_client + + @property + def collection(self) -> Collection: + if self._collection is None: + database = self.mongo_client[self.database_name] + if self.collection_name not in database.list_collection_names(): + # check the kind of vector search to be performed + # prepare the command accordingly + create_index_commands = {} + if self.vector_search_kwargs.get("kind") == "vector-ivf": + create_index_commands = self._get_vector_index_ivf() + elif self.vector_search_kwargs.get("kind") == "vector-hnsw": + create_index_commands = self._get_vector_index_hnsw() + database.command(create_index_commands) + self._collection = database[self.collection_name] + return self._collection + + def create_filter_index( + self, property_to_filter: str, + ) -> dict[str, Any]: + command = { + "createIndexes": self.collection_name, + "indexes": [ + { + "key": {property_to_filter: 1}, + "name": self.vector_search_index_name, + } + ], + } + + create_index_response: dict[str, Any] = self.mongo_client[self.database_name].command(command) + return create_index_response + + def count_documents(self) -> int: + """ + Returns how many documents are present in the document store. + + :returns: The number of documents in the document store. + """ + return self.collection.count_documents({}) + + def delete_documents(self, document_ids: List[str]) -> None: + """ + Deletes all documents with a matching document_ids from the document store. + + :param document_ids: the document ids to delete + """ + if not document_ids: + return + self.collection.delete_many(filter={"id": {"$in": document_ids}}) + + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: + """ + Writes documents into the MongoDB Atlas collection. + + :param documents: A list of Documents to write to the document store. + :param policy: The duplicate policy to use when writing documents. + :raises DuplicateDocumentError: If a document with the same ID already exists in the document store + and the policy is set to DuplicatePolicy.FAIL (or not specified). + :raises ValueError: If the documents are not of type Document. + :returns: The number of documents written to the document store. + """ + + if len(documents) > 0: + if not isinstance(documents[0], Document): + msg = "param 'documents' must contain a list of objects of type Document" + raise ValueError(msg) + + if policy == DuplicatePolicy.NONE: + policy = DuplicatePolicy.FAIL + + mongo_documents = [] + for doc in documents: + doc_dict = doc.to_dict(flatten=False) + if "sparse_embedding" in doc_dict: + sparse_embedding = doc_dict.pop("sparse_embedding", None) + if sparse_embedding: + logger.warning( + "Document %s has the `sparse_embedding` field set," + "but storing sparse embeddings in MongoDB Atlas is not currently supported." + "The `sparse_embedding` field will be ignored.", + doc.id, + ) + mongo_documents.append(doc_dict) + operations: List[Union[UpdateOne, InsertOne, ReplaceOne]] + written_docs = len(documents) + + if policy == DuplicatePolicy.SKIP: + operations = [UpdateOne({"id": doc["id"]}, {"$setOnInsert": doc}, upsert=True) for doc in mongo_documents] + existing_documents = self.collection.count_documents({"id": {"$in": [doc.id for doc in documents]}}) + written_docs -= existing_documents + elif policy == DuplicatePolicy.FAIL: + operations = [InsertOne(doc) for doc in mongo_documents] + else: + operations = [ReplaceOne({"id": doc["id"]}, upsert=True, replacement=doc) for doc in mongo_documents] + + try: + self.collection.bulk_write(operations) + except BulkWriteError as e: + msg = f"Duplicate documents found: {e.details['writeErrors']}" + raise DuplicateDocumentError(msg) from e + + return written_docs + + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + """ + Returns the documents that match the filters provided. + + For a detailed specification of the filters, + refer to the Haystack [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering). + + :param filters: The filters to apply. It returns only the documents that match the filters. + :returns: A list of Documents that match the given filters. + """ + filters = _normalize_filters(filters) if filters else None + documents = list(self.collection.find(filters)) + for doc in documents: + doc.pop("_id", None) # MongoDB's internal id doesn't belong into a Haystack document, so we remove it. + return [Document.from_dict(doc) for doc in documents] + + def _embedding_retrieval( + self, + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + ) -> List[Document]: + """ + Find the documents that are most similar to the provided `query_embedding` by using a vector similarity metric. + + :param query_embedding: Embedding of the query + :param filters: Optional filters. + :param top_k: How many documents to return. + :returns: A list of Documents that are most similar to the given `query_embedding` + :raises ValueError: If `query_embedding` is empty. + :raises DocumentStoreError: If the retrieval of documents from MongoDB Atlas fails. + """ + pipeline: list[dict[str, Any]] = [] + if self.vector_search_kwargs.get("kind") == "vector-ivf": + pipeline = self._get_pipeline_vector_ivf(query_embedding, top_k, filters) + elif self.vector_search_kwargs.get("kind") == "vector-hnsw": + pipeline = self._get_pipeline_vector_hnsw(query_embedding, top_k, filters) + + try: + documents = list(self.collection.aggregate(pipeline)) + except Exception as e: + msg = f"Retrieval of documents from MongoDB Atlas failed: {e}" + if filters: + msg += ( + "\nMake sure that the fields used in the filters are included " + "in the `vector_search_index` configuration" + ) + raise DocumentStoreError(msg) from e + + documents = [self._mongo_doc_to_haystack_doc(doc) for doc in documents] + return documents + + def _get_vector_index_ivf(self) -> dict[str, Any]: + return { + "createIndexes": self.collection_name, + "indexes": [ + { + "name": self.vector_search_index_name, + "key": {"embedding": "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": self.vector_search_kwargs.get("kind"), + "numLists": self.vector_search_kwargs.get("num_lists"), + "similarity": self.vector_search_kwargs.get("similarity"), + "dimensions": self.vector_search_kwargs.get("dimensions"), + }, + } + ], + } + + def _get_vector_index_hnsw(self) -> dict[str, Any]: + return { + "createIndexes": self.collection_name, + "indexes": [ + { + "name": self.vector_search_index_name, + "key": {"embedding": "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": self.vector_search_kwargs.get("kind"), + "m": self.vector_search_kwargs.get("m"), + "efConstruction": self.vector_search_kwargs.get("ef_construction"), + "similarity": self.vector_search_kwargs.get("similarity"), + "dimensions": self.vector_search_kwargs.get("dimensions"), + }, + } + ], + } + + def _get_pipeline_vector_ivf( + self, embeddings: list[float], top_k: int, filters: Optional[dict] + ) -> list[dict[str, Any]]: + params = { + "vector": embeddings, + "path": "embedding", + "k": top_k, + } + if filters: + params["filter"] = filters + + pipeline: list[dict[str, Any]] = [ + { + "$search": { + "cosmosSearch": params, + "returnStoredSource": True, + } + }, + { + "$project": { + "similarityScore": {"$meta": "searchScore"}, + "document": "$$ROOT", + } + }, + ] + return pipeline + + def _get_pipeline_vector_hnsw( + self, embeddings: list[float], top_k: int, filters: Optional[dict] + ) -> list[dict[str, Any]]: + params = { + "vector": embeddings, + "path": "embedding", + "k": top_k, + "efSearch": self.vector_search_kwargs.get("ef_search"), + } + if filters: + params["filter"] = filters + + pipeline: list[dict[str, Any]] = [ + { + "$search": { + "cosmosSearch": params, + } + }, + { + "$project": { + "similarityScore": {"$meta": "searchScore"}, + "document": "$$ROOT", + } + }, + ] + return pipeline + + def _mongo_doc_to_haystack_doc(self, mongo_doc: Dict[str, Any]) -> Document: + """ + Converts the dictionary coming out of MongoDB into a Haystack document + + :param mongo_doc: A dictionary representing a document as stored in MongoDB + :returns: A Haystack Document object + """ + mongo_doc.pop("_id", None) + return Document.from_dict(mongo_doc) diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/filters.py b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/filters.py new file mode 100644 index 000000000..0b5986222 --- /dev/null +++ b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/filters.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from datetime import datetime +from typing import Any, Dict + +from haystack.errors import FilterError +from pandas import DataFrame + +UNSUPPORTED_TYPES_FOR_COMPARISON = (list, DataFrame) + + +def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]: + """ + Converts Haystack filters to MongoDB filters. + """ + if not isinstance(filters, dict): + msg = "Filters must be a dictionary" + raise FilterError(msg) + + if "operator" not in filters and "conditions" not in filters: + msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." + raise ValueError(msg) + + if "field" in filters: + return _parse_comparison_condition(filters) + return _parse_logical_condition(filters) + + +def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise FilterError(msg) + + # logical conditions can be nested, so we need to parse them recursively + conditions = [] + for c in condition["conditions"]: + if "field" in c: + conditions.append(_parse_comparison_condition(c)) + else: + conditions.append(_parse_logical_condition(c)) + + operator = condition["operator"] + if operator == "AND": + return {"$and": conditions} + elif operator == "OR": + return {"$or": conditions} + elif operator == "NOT": + # MongoDB doesn't support our NOT operator (logical NAND) directly. + # we combine $nor and $and to achieve the same effect. + return {"$nor": [{"$and": conditions}]} + + msg = f"Unknown logical operator '{operator}'. Valid operators are: 'AND', 'OR', 'NOT'" + raise FilterError(msg) + + +def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + field: str = condition["field"] + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise FilterError(msg) + operator: str = condition["operator"] + value: Any = condition["value"] + + if isinstance(value, DataFrame): + value = value.to_json() + + return COMPARISON_OPERATORS[operator](field, value) + + +def _equal(field: str, value: Any) -> Dict[str, Any]: + return {field: {"$eq": value}} + + +def _not_equal(field: str, value: Any) -> Dict[str, Any]: + return {field: {"$ne": value}} + + +def _validate_type_for_comparison(value: Any) -> None: + msg = f"Cant compare {type(value)} using operators '>', '>=', '<', '<='." + if isinstance(value, UNSUPPORTED_TYPES_FOR_COMPARISON): + raise FilterError(msg) + elif isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg += "\nStrings are only comparable if they are ISO formatted dates." + raise FilterError(msg) from exc + + +def _greater_than(field: str, value: Any) -> Dict[str, Any]: + _validate_type_for_comparison(value) + return {field: {"$gt": value}} + + +def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # we want {field: {"$gte": null}} to return an empty result + # $gte with null values in MongoDB returns a non-empty result, while $gt aligns with our expectations + return {field: {"$gt": value}} + + _validate_type_for_comparison(value) + return {field: {"$gte": value}} + + +def _less_than(field: str, value: Any) -> Dict[str, Any]: + _validate_type_for_comparison(value) + return {field: {"$lt": value}} + + +def _less_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # we want {field: {"$lte": null}} to return an empty result + # $lte with null values in MongoDB returns a non-empty result, while $lt aligns with our expectations + return {field: {"$lt": value}} + _validate_type_for_comparison(value) + + return {field: {"$lte": value}} + + +def _not_in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'not in' comparator in Pinecone" + raise FilterError(msg) + + return {field: {"$nin": value}} + + +def _in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' comparator in Pinecone" + raise FilterError(msg) + + return {field: {"$in": value}} + + +COMPARISON_OPERATORS = { + "==": _equal, + "!=": _not_equal, + ">": _greater_than, + ">=": _greater_than_equal, + "<": _less_than, + "<=": _less_than_equal, + "in": _in, + "not in": _not_in, +} diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/__init__.py b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py new file mode 100644 index 000000000..7d0982e8b --- /dev/null +++ b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py @@ -0,0 +1,195 @@ +import logging +from typing import Any, Dict, List, Optional + +from azure.cosmos import CosmosClient +from azure.cosmos.container import ContainerProxy +from haystack import Document +from haystack.document_stores.types import DuplicatePolicy + +logger = logging.getLogger(__name__) + + +class AzureCosmosNoSqlDocumentStore: + """ + AzureCosmosNoSqlDocumentStore is a DocumentStore implementation that uses [Azure CosmosDB NoSql] + (https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/vector-search) service. + + To connect to Azure CosmosDB NoSql, you need to provide a cosmosClient. After providing the connection + string, you need to provide `database_name`, `container_name`, `vector_embedding_policy`, `indexing_policy`, + `cosmos_container_properties`. + + Please refer to README.md file for detailed setup. + """ + def __init__( + self, + cosmos_client: CosmosClient, + database_name: str, + container_name: str, + vector_embedding_policy: dict[str, Any] = None, + indexing_policy: dict[str, Any] = None, + cosmos_container_properties: dict[str, Any] = None, + ): + """ + Creates a new AzureCosmosNoSqlDocumentStore instance. + + :param cosmos_client: Azure CosmosClient for NoSql account. + :param database_name: Name of the database to use. + :param container_name: Name of the container to use. + :param vector_embedding_policy: Dictionary of vector embeddings to use. + :param indexing_policy: Dictionary of indexing policies to use. + :param cosmos_container_properties: Dictionary of cosmos container properties to use. + + :raises ValueError: if the vectorIndexes or vectorEmbeddings are null in the indexing_policy + or the vector_embedding_policy. + """ + if indexing_policy["vectorIndexes"] is None or len(indexing_policy["vectorIndexes"]) == 0: + raise ValueError("vectorIndexes cannot be null or empty in the indexing_policy.") + if vector_embedding_policy is None or len(vector_embedding_policy["vectorEmbeddings"]) == 0: + raise ValueError("vectorEmbeddings cannot be null or empty in the vector_embedding_policy.") + + self.cosmos_client = cosmos_client + self.database_name = database_name + self.container_name = container_name + self.vector_embedding_policy = vector_embedding_policy + self.indexing_policy = indexing_policy + self.cosmos_container_properties = cosmos_container_properties + self._container = Optional[ContainerProxy] = None + + @property + def container(self) -> ContainerProxy: + # Create the database if it already doesn't exist + database = self.cosmos_client.create_database_if_not_exists(id=self.database_name) + if database.get_container_client(self.container_name).read() is not None: + # Create the collection if it already doesn't exist + self._container = database.create_container( + id=self.container_name, + partition_key=self.cosmos_container_properties["partition_key"], + indexing_policy=self.indexing_policy, + vector_embedding_policy=self.vector_embedding_policy, + ) + return self._container + + def count_documents(self) -> int: + """ + Returns how many documents are present in the document store. + + :returns: The number of documents in the document store. + """ + return len(list(self.container.read_all_items())) + + def delete_documents(self, document_ids: List[str]) -> None: + """ + Deletes all documents with a matching document_ids from the document store. + + :param document_ids: the document ids to delete + """ + if not document_ids: + return + for document_id in document_ids: + self.container.delete_item(document_id) + + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int: + """ + Writes documents into the MongoDB Atlas collection. + + :param documents: A list of Documents to write to the document store. + :param policy: The duplicate policy to use when writing documents. + :raises DuplicateDocumentError: If a document with the same ID already exists in the document store + and the policy is set to DuplicatePolicy.FAIL (or not specified). + :raises ValueError: If the documents are not of type Document. + :returns: The number of documents written to the document store. + """ + + if len(documents) > 0: + if not isinstance(documents[0], Document): + msg = "param 'documents' must contain a list of objects of type Document" + raise ValueError(msg) + + for doc in documents: + self.container.create_item(doc.to_dict()) + return len(documents) + + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + """ + Returns the documents that match the filters provided. + + For a detailed specification of the filters, + refer to the Haystack [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering). + + :param filters: The filters to apply. It returns only the documents that match the filters. + :returns: A list of Documents that match the given filters. + """ + query = self._filtered_query(filters) + items = list(self.container.query_items(query, enable_cross_partition_query=True)) + return [Document.from_dict(doc) for doc in items] + + def _embedding_retrieval( + self, + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + ) -> List[Document]: + """ + Find the documents that are most similar to the provided `query_embedding` by using a vector similarity metric. + + :param query_embedding: Embedding of the query + :param filters: Optional filters. + :param top_k: How many documents to return. + :returns: A list of Documents that are most similar to the given `query_embedding` + :raises ValueError: If `query_embedding` is empty. + :raises DocumentStoreError: If the retrieval of documents from MongoDB Atlas fails. + """ + embedding_key = self.vector_embedding_policy["vectorEmbeddings"][0]["path"][1:] + + query = "SELECT " + # If limit_offset_clause is not specified, add TOP clause + if filters is None or filters.get("limit_offset_clause") is None: + query += "TOP @limit " + query += ( + "c.id, c.@embeddingKey, c.content, c.meta, c.score, " + "VectorDistance(c.@embeddingKey, @embeddings) AS SimilarityScore FROM c" + ) + # Add where_clause if specified + if filters is not None and filters.get("where_clause") is not None: + query += " {}".format(filters["where_clause"]) + query += " ORDER BY VectorDistance(c.@embeddingKey, @embeddings)" + # Add limit_offset_clause if specified + if filters is not None and filters.get("limit_offset_clause") is not None: + query += " {}".format(filters["limit_offset_clause"]) + parameters = [ + {"name": "@limit", "value": top_k}, + {"name": "@embeddingKey", "value": embedding_key}, + {"name": "@embeddings", "value": query_embedding}, + ] + + items = list(self.container.query_items(query, parameters=parameters, enable_cross_partition_query=True)) + nearest_results = [self._cosmos_doc_to_haystack_doc(item) for item in items] + return nearest_results + + def _filtered_query(self, filters: Dict[str, Any]) -> str: + if filters is None: + return "SELECT * FROM c" + query = "SELECT " + + if filters["top"] is not None: + query += filters["top"] + query += "* FROM c" + if filters["where"] is not None: + query += filters["where"] + if filters["order_by"] is not None: + query += filters["order_by"] + if filters["limit_offset"] is not None and filters["top"] is None: + query += filters["limit_offset"] + return query + + def _cosmos_doc_to_haystack_doc(self, cosmos_doc: Dict[str, Any]) -> Document: + """ + Converts the dictionary coming out of CosmosDB NoSql into a Haystack document + + :param cosmos_doc: A dictionary representing a document as stored in CosmosDB + :returns: A Haystack Document object + """ + cosmos_doc.pop("id", None) + return Document.from_dict(cosmos_doc) + + From e16c4ddee099a27fd3d07e0c39f9c0295f73f277 Mon Sep 17 00:00:00 2001 From: Aayush Kataria Date: Thu, 26 Sep 2024 12:03:57 -0700 Subject: [PATCH 2/3] Adding embedding retrievers --- .../retrievers/mongodb_vcore => }/__init__.py | 0 .../document_stores/nosql/__init__.py | 0 .../azure_cosmos_db_mongo_vcore/CHANGELOG.md | 12 ++ .../azure_cosmos_db_mongo_vcore/LICENSE.txt | 73 +++++++++ .../azure_cosmos_db_mongo_vcore/README.md | 43 +++++ .../pyproject.toml | 155 ++++++++++++++++++ .../retrievers/mongodb_vcore/__init__.py | 4 + .../mongodb_vcore/embedding_retriever.py | 48 ++++++ .../document_stores/mongodb_vcore/__init__.py | 3 + .../mongodb_vcore/document_store.py | 19 ++- .../document_stores/mongodb_vcore/filters.py | 0 .../tests}/__init__.py | 0 .../azure_cosmos_db_nosql/CHANGELOG.md | 12 ++ .../azure_cosmos_db_nosql/LICENSE.txt | 73 +++++++++ integrations/azure_cosmos_db_nosql/README.md | 42 +++++ .../azure_cosmos_db_nosql/pyproject.toml | 155 ++++++++++++++++++ .../components/retrievers/nosql/__init__.py | 5 + .../retrievers/nosql/embedding_retriever.py | 47 ++++++ .../document_stores/nosql/__init__.py | 3 + .../document_stores/nosql/document_store.py | 84 +++++++++- .../tests}/__init__.py | 0 21 files changed, 773 insertions(+), 5 deletions(-) rename integrations/{azure_cosmos_db/src/haystack_integrations/components/retrievers/mongodb_vcore => }/__init__.py (100%) delete mode 100644 integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/__init__.py create mode 100644 integrations/azure_cosmos_db_mongo_vcore/CHANGELOG.md create mode 100644 integrations/azure_cosmos_db_mongo_vcore/LICENSE.txt create mode 100644 integrations/azure_cosmos_db_mongo_vcore/README.md create mode 100644 integrations/azure_cosmos_db_mongo_vcore/pyproject.toml create mode 100644 integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py create mode 100644 integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/embedding_retriever.py create mode 100644 integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py rename integrations/{azure_cosmos_db => azure_cosmos_db_mongo_vcore}/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py (95%) rename integrations/{azure_cosmos_db => azure_cosmos_db_mongo_vcore}/src/haystack_integrations/document_stores/mongodb_vcore/filters.py (100%) rename integrations/{azure_cosmos_db/src/haystack_integrations/components/retrievers/nosql => azure_cosmos_db_mongo_vcore/tests}/__init__.py (100%) create mode 100644 integrations/azure_cosmos_db_nosql/CHANGELOG.md create mode 100644 integrations/azure_cosmos_db_nosql/LICENSE.txt create mode 100644 integrations/azure_cosmos_db_nosql/README.md create mode 100644 integrations/azure_cosmos_db_nosql/pyproject.toml create mode 100644 integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py create mode 100644 integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/embedding_retriever.py create mode 100644 integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/__init__.py rename integrations/{azure_cosmos_db => azure_cosmos_db_nosql}/src/haystack_integrations/document_stores/nosql/document_store.py (74%) rename integrations/{azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore => azure_cosmos_db_nosql/tests}/__init__.py (100%) diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py b/integrations/__init__.py similarity index 100% rename from integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py rename to integrations/__init__.py diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/__init__.py b/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/integrations/azure_cosmos_db_mongo_vcore/CHANGELOG.md b/integrations/azure_cosmos_db_mongo_vcore/CHANGELOG.md new file mode 100644 index 000000000..f84288d55 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +## [integrations/azure_cosmos_db_mongo_vcore-v0.1.0] - 2024-09-26 + +### 🚀 Features + +- AzureCosmosDBMongoVCore Document Store [#1099](#https://github.com/deepset-ai/haystack-core-integrations/pull/1099) +- `AzureCosmosDBMongoVCoreEmbeddingRetriever` [#1099](#https://github.com/deepset-ai/haystack-core-integrations/pull/1099) + +This PR will also push the docs to Readme + + diff --git a/integrations/azure_cosmos_db_mongo_vcore/LICENSE.txt b/integrations/azure_cosmos_db_mongo_vcore/LICENSE.txt new file mode 100644 index 000000000..137069b82 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/LICENSE.txt @@ -0,0 +1,73 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/integrations/azure_cosmos_db_mongo_vcore/README.md b/integrations/azure_cosmos_db_mongo_vcore/README.md new file mode 100644 index 000000000..ce6570115 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/README.md @@ -0,0 +1,43 @@ +# mongodb-atlas-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) + +----- + +**Table of Contents** + +- [Installation](#installation) +- [Contributing](#contributing) +- [License](#license) + +## Installation + +```console +pip install azure_cosmos_db_mongo_vcore-haystack +``` + +## Contributing + +`hatch` is the best way to interact with this project, to install it: +```sh +pip install hatch +``` + +To run the linters `ruff` and `mypy`: +``` +hatch run lint:all +``` + +To run all the tests: +``` +hatch run test +``` + +Note: you need your own Azure CosmosDB Mongo vCore account to run the tests: you can make one here: +https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search. Once you have it, export the connection string +to the env var `AZURE_COSMOS_MONGO_CONNECTION_STRING`. If you forget to do so, all the tests will be skipped. + +## License + +`mongodb-atlas-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/azure_cosmos_db_mongo_vcore/pyproject.toml b/integrations/azure_cosmos_db_mongo_vcore/pyproject.toml new file mode 100644 index 000000000..cce2fdb3c --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/pyproject.toml @@ -0,0 +1,155 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "azure_cosmos_db_mongo_vcore-haystack" +dynamic = ["version"] +description = "An integration of Azure CosmosDB Mongo vCore with Haystack" +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [{ name = "Aayush Kataria", email = "akataria@microsoft.com" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai", "pymongo[srv]"] + +[project.urls] +Source = "https://github.com/deepset-ai/haystack-core-integrations" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/azure_cosmos_db_mongo_vcore/README.md" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/azure_cosmos_db_mongo_vcore-v(?P.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/azure_cosmos_db_mongo_vcore-v[0-9]*"' + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "pytest-rerunfailures", + "ipython", + "haystack-pydoc-tools", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x" +cov-report = ["- coverage combine", "coverage report"] +cov = ["test-cov", "cov-report"] +cov-retry = ["test-cov-retry", "cov-report"] +docs = ["pydoc-markdown pydoc/config.yml"] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] + +[tool.hatch.envs.lint] +detached = true +dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" +style = ["ruff check {args:. --exclude tests/, examples/}", "black --check --diff {args:.}"] +fmt = ["black {args:.}", "ruff --fix {args:. --exclude tests/, examples/}", "style"] +all = ["style", "typing"] + +[tool.black] +target-version = ["py38"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py38" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["src"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] +# examples can contain "print" commands +"examples/**/*" = ["T201"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false + + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + + +[[tool.mypy.overrides]] +module = ["haystack.*", "haystack_integrations.*", "pymongo.*", "pytest.*"] +ignore_missing_imports = true diff --git a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py new file mode 100644 index 000000000..183705271 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py @@ -0,0 +1,4 @@ +from integrations.azure_cosmos_db_mongo_vcore.src.haystack_integrations.components.retrievers.mongodb_vcore.embedding_retriever import \ + AzureCosmosDBMongoVCoreEmbeddingRetriever + +__all__ = ["AzureCosmosDBMongoVCoreEmbeddingRetriever"] diff --git a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/embedding_retriever.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/embedding_retriever.py new file mode 100644 index 000000000..6ac0a6ee8 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/embedding_retriever.py @@ -0,0 +1,48 @@ +from typing import Any, Dict, List, Optional + +from haystack import component, default_from_dict +from haystack.dataclasses import Document + +from haystack_integrations.document_stores.mongodb_vcore.document_store import \ + AzureCosmosDBMongoVCoreDocumentStore + + +@component +class AzureCosmosDBMongoVCoreEmbeddingRetriever: + def __init__( + self, + *, + document_store: AzureCosmosDBMongoVCoreDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + ): + if not isinstance(document_store, AzureCosmosDBMongoVCoreDocumentStore): + msg = "document_store must be an instance of AzureCosmosDBMongoVCoreDocumentStore" + raise ValueError(msg) + + self.document_store = document_store + self.filters = filters or {} + self.top_k = top_k + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "AzureCosmosDBMongoVCoreEmbeddingRetriever": + data["init_parameters"]["document_store"] = AzureCosmosDBMongoVCoreDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run( + self, + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k: Optional[int] = None, + ) -> Dict[str, List[Document]]: + top_k = top_k or self.top_k + + docs = self.document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=filters, + top_k=top_k, + ) + return {"documents": docs} diff --git a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py new file mode 100644 index 000000000..1d1bb82c2 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py @@ -0,0 +1,3 @@ +from .document_store import AzureCosmosDBMongoVCoreDocumentStore + +__all__ = ["AzureCosmosDBMongoVCoreDocumentStore"] diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py similarity index 95% rename from integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py rename to integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py index 48b16b8d3..d507d2f48 100644 --- a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py +++ b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py @@ -2,10 +2,11 @@ import re from typing import Any, Dict, List, Optional, Union +from haystack import default_from_dict from haystack.dataclasses.document import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.utils import Secret +from haystack.utils import Secret, deserialize_secrets_inplace from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne from pymongo.collection import Collection from pymongo.driver_info import DriverInfo @@ -85,7 +86,7 @@ def mongo_client(self) -> MongoClient: if self._mongo_client is None: self._mongo_client = MongoClient( self.mongo_connection_string.resolve_value(), - appname="", + appname="HayStack-CDBMongoVCore-DocumentStore-Python", driver=DriverInfo(name="AzureCosmosDBMongoVCoreDocumentStore") ) return self._mongo_client @@ -122,6 +123,20 @@ def create_filter_index( create_index_response: dict[str, Any] = self.mongo_client[self.database_name].command(command) return create_index_response + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "AzureCosmosDBMongoVCoreDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["mongo_connection_string"]) + return default_from_dict(cls, data) + + def count_documents(self) -> int: """ Returns how many documents are present in the document store. diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/filters.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/filters.py similarity index 100% rename from integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/filters.py rename to integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/filters.py diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/nosql/__init__.py b/integrations/azure_cosmos_db_mongo_vcore/tests/__init__.py similarity index 100% rename from integrations/azure_cosmos_db/src/haystack_integrations/components/retrievers/nosql/__init__.py rename to integrations/azure_cosmos_db_mongo_vcore/tests/__init__.py diff --git a/integrations/azure_cosmos_db_nosql/CHANGELOG.md b/integrations/azure_cosmos_db_nosql/CHANGELOG.md new file mode 100644 index 000000000..94ba078b5 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +## [integrations/azure_cosmos_db_mongo_vcore-v0.1.0] - 2024-09-26 + +### 🚀 Features + +- AzureCosmosDBMongoVCore Document Store (#413) +- `MongoDBAtlasEmbeddingRetriever` (#427) + +This PR will also push the docs to Readme + + diff --git a/integrations/azure_cosmos_db_nosql/LICENSE.txt b/integrations/azure_cosmos_db_nosql/LICENSE.txt new file mode 100644 index 000000000..137069b82 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/LICENSE.txt @@ -0,0 +1,73 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/integrations/azure_cosmos_db_nosql/README.md b/integrations/azure_cosmos_db_nosql/README.md new file mode 100644 index 000000000..cb737b76f --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/README.md @@ -0,0 +1,42 @@ +# mongodb-atlas-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) + +----- + +**Table of Contents** + +- [Installation](#installation) +- [Contributing](#contributing) +- [License](#license) + +## Installation + +```console +pip install azure_cosmos_db_nosql-haystack +``` + +## Contributing + +`hatch` is the best way to interact with this project, to install it: +```sh +pip install hatch +``` + +To run the linters `ruff` and `mypy`: +``` +hatch run lint:all +``` + +To run all the tests: +``` +hatch run test +``` + +Note: you need your own Azure CosmosDB NoSql account to run the tests: you can make one here: +https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search. + +## License + +`mongodb-atlas-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/azure_cosmos_db_nosql/pyproject.toml b/integrations/azure_cosmos_db_nosql/pyproject.toml new file mode 100644 index 000000000..1821da96d --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/pyproject.toml @@ -0,0 +1,155 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "azure_cosmos_db_nosql-haystack" +dynamic = ["version"] +description = "An integration of Azure CosmosDB NoSql with Haystack" +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [{ name = "Aayush Kataria", email = "akataria@microsoft.com" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai", "azure-cosmos", "azure-identity"] + +[project.urls] +Source = "https://github.com/deepset-ai/haystack-core-integrations" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/azure_cosmos_db_nosql/README.md" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/azure_cosmos_db_nosql-v(?P.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/azure_cosmos_db_nosql-v[0-9]*"' + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "pytest-rerunfailures", + "ipython", + "haystack-pydoc-tools", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x" +cov-report = ["- coverage combine", "coverage report"] +cov = ["test-cov", "cov-report"] +cov-retry = ["test-cov-retry", "cov-report"] +docs = ["pydoc-markdown pydoc/config.yml"] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] + +[tool.hatch.envs.lint] +detached = true +dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" +style = ["ruff check {args:. --exclude tests/, examples/}", "black --check --diff {args:.}"] +fmt = ["black {args:.}", "ruff --fix {args:. --exclude tests/, examples/}", "style"] +all = ["style", "typing"] + +[tool.black] +target-version = ["py38"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py38" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["src"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] +# examples can contain "print" commands +"examples/**/*" = ["T201"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false + + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + + +[[tool.mypy.overrides]] +module = ["haystack.*", "haystack_integrations.*", "pytest.*"] +ignore_missing_imports = true diff --git a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py new file mode 100644 index 000000000..0ae889af7 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py @@ -0,0 +1,5 @@ +from integrations.azure_cosmos_db_nosql.src.haystack_integrations.components.retrievers.nosql.embedding_retriever import \ + AzureCosmosDBNoSqlEmbeddingRetriever + + +__all__ = ["AzureCosmosDBNoSqlEmbeddingRetriever"] \ No newline at end of file diff --git a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/embedding_retriever.py b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/embedding_retriever.py new file mode 100644 index 000000000..e0e047c01 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/embedding_retriever.py @@ -0,0 +1,47 @@ +from typing import Any, Dict, List, Optional + +from haystack import component, default_from_dict +from haystack.dataclasses import Document + +from integrations.azure_cosmos_db_nosql.src.haystack_integrations.document_stores.nosql.document_store import AzureCosmosDBNoSqlDocumentStore + + +@component +class AzureCosmosDBNoSqlEmbeddingRetriever: + def __init__( + self, + *, + document_store: AzureCosmosDBNoSqlDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + ): + if not isinstance(document_store, AzureCosmosDBNoSqlDocumentStore): + msg = "document_store must be an instance of AzureCosmosDBNoSqlDocumentStore" + raise ValueError(msg) + + self.document_store = document_store + self.filters = filters + self.top_k = top_k + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "AzureCosmosDBNoSqlDocumentStore": + data["init_parameters"]["document_store"] = AzureCosmosDBNoSqlDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run( + self, + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k: Optional[int] = None, + ) -> Dict[str, List[Document]]: + top_k = top_k or self.top_k + + docs = self.document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=filters, + top_k=top_k, + ) + return {"documents": docs} diff --git a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/__init__.py b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/__init__.py new file mode 100644 index 000000000..191620800 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/__init__.py @@ -0,0 +1,3 @@ +from .document_store import AzureCosmosDBNoSqlDocumentStore + +__all__ = ["AzureCosmosDBNoSqlDocumentStore"] diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py similarity index 74% rename from integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py rename to integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py index 7d0982e8b..770f5427d 100644 --- a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/nosql/document_store.py +++ b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py @@ -3,15 +3,18 @@ from azure.cosmos import CosmosClient from azure.cosmos.container import ContainerProxy +from azure.identity import ClientSecretCredential +from haystack import default_from_dict from haystack import Document from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret, deserialize_secrets_inplace logger = logging.getLogger(__name__) -class AzureCosmosNoSqlDocumentStore: +class AzureCosmosDBNoSqlDocumentStore: """ - AzureCosmosNoSqlDocumentStore is a DocumentStore implementation that uses [Azure CosmosDB NoSql] + AzureCosmosDBNoSqlDocumentStore is a DocumentStore implementation that uses [Azure CosmosDB NoSql] (https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/vector-search) service. To connect to Azure CosmosDB NoSql, you need to provide a cosmosClient. After providing the connection @@ -30,7 +33,7 @@ def __init__( cosmos_container_properties: dict[str, Any] = None, ): """ - Creates a new AzureCosmosNoSqlDocumentStore instance. + Creates a new AzureCosmosDBNoSqlDocumentStore instance. :param cosmos_client: Azure CosmosClient for NoSql account. :param database_name: Name of the database to use. @@ -69,6 +72,81 @@ def container(self) -> ContainerProxy: ) return self._container + @classmethod + def from_connection_string( + cls, + connection_string: Secret = Secret.from_env_var("AZURE_COSMOS_NOSQL_CONNECTION_STRING"), + database_name: str = None, + container_name: str = None, + vector_embedding_policy: dict[str, Any] = None, + indexing_policy: dict[str, Any] = None, + cosmos_container_properties: dict[str, Any] = None, + ) -> "AzureCosmosDBNoSqlDocumentStore": + cosmos_client = CosmosClient.from_connection_string(connection_string.resolve_value()) + return cls( + cosmos_client, + database_name, + container_name, + vector_embedding_policy, + indexing_policy, + cosmos_container_properties, + ) + + @classmethod + def from_uri_and_key( + cls, + uri: str, + key: str, + database_name: str = None, + container_name: str = None, + vector_embedding_policy: dict[str, Any] = None, + indexing_policy: dict[str, Any] = None, + cosmos_container_properties: dict[str, Any] = None, + ) -> "AzureCosmosDBNoSqlDocumentStore": + cosmos_client = CosmosClient(uri, key) + return cls( + cosmos_client, + database_name, + container_name, + vector_embedding_policy, + indexing_policy, + cosmos_container_properties, + ) + + @classmethod + def from_aad_token( + cls, + uri: str, + credential: ClientSecretCredential, + database_name: str = None, + container_name: str = None, + vector_embedding_policy: dict[str, Any] = None, + indexing_policy: dict[str, Any] = None, + cosmos_container_properties: dict[str, Any] = None, + ) -> "AzureCosmosDBNoSqlDocumentStore": + cosmos_client = CosmosClient(uri, credential) + return cls( + cosmos_client, + database_name, + container_name, + vector_embedding_policy, + indexing_policy, + cosmos_container_properties, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "AzureCosmosDBNoSqlDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["mongo_connection_string"]) + return default_from_dict(cls, data) + def count_documents(self) -> int: """ Returns how many documents are present in the document store. diff --git a/integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py b/integrations/azure_cosmos_db_nosql/tests/__init__.py similarity index 100% rename from integrations/azure_cosmos_db/src/haystack_integrations/document_stores/mongodb_vcore/__init__.py rename to integrations/azure_cosmos_db_nosql/tests/__init__.py From c2eabe293c1216a0c7fd97d231833a010b87fc70 Mon Sep 17 00:00:00 2001 From: Aayush Kataria Date: Mon, 7 Oct 2024 18:41:28 -0700 Subject: [PATCH 3/3] Adding test cases --- .../retrievers/mongodb_vcore/__init__.py | 2 +- .../mongodb_vcore/document_store.py | 74 ++++++------ .../tests/test_document_store.py | 75 ++++++++++++ .../tests/test_embedding_retrieval.py | 88 ++++++++++++++ .../tests/test_utils.py | 43 +++++++ .../components/retrievers/nosql/__init__.py | 2 +- .../document_stores/nosql/document_store.py | 45 +++---- .../tests/test_document_store.py | 49 ++++++++ .../tests/test_embedding_retrieval.py | 112 ++++++++++++++++++ .../azure_cosmos_db_nosql/tests/test_utils.py | 67 +++++++++++ 10 files changed, 494 insertions(+), 63 deletions(-) create mode 100644 integrations/azure_cosmos_db_mongo_vcore/tests/test_document_store.py create mode 100644 integrations/azure_cosmos_db_mongo_vcore/tests/test_embedding_retrieval.py create mode 100644 integrations/azure_cosmos_db_mongo_vcore/tests/test_utils.py create mode 100644 integrations/azure_cosmos_db_nosql/tests/test_document_store.py create mode 100644 integrations/azure_cosmos_db_nosql/tests/test_embedding_retrieval.py create mode 100644 integrations/azure_cosmos_db_nosql/tests/test_utils.py diff --git a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py index 183705271..1af6f241f 100644 --- a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py +++ b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/components/retrievers/mongodb_vcore/__init__.py @@ -1,4 +1,4 @@ -from integrations.azure_cosmos_db_mongo_vcore.src.haystack_integrations.components.retrievers.mongodb_vcore.embedding_retriever import \ +from haystack_integrations.components.retrievers.mongodb_vcore.embedding_retriever import \ AzureCosmosDBMongoVCoreEmbeddingRetriever __all__ = ["AzureCosmosDBMongoVCoreEmbeddingRetriever"] diff --git a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py index d507d2f48..4c9aef2c3 100644 --- a/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py +++ b/integrations/azure_cosmos_db_mongo_vcore/src/haystack_integrations/document_stores/mongodb_vcore/document_store.py @@ -12,7 +12,7 @@ from pymongo.driver_info import DriverInfo from pymongo.errors import BulkWriteError -from haystack_integrations.document_stores.mongodb_vcore.filters import _normalize_filters +from integrations.azure_cosmos_db_mongo_vcore.src.haystack_integrations.document_stores.mongodb_vcore.filters import _normalize_filters logger = logging.getLogger(__name__) @@ -38,7 +38,7 @@ class AzureCosmosDBMongoVCoreDocumentStore: The last parameter users need to provide is a `vector_search_kwargs` - used for configs for vector search in mongo vCore. { - "vector_dimensions": 1536, + "dimensions": 1536, "num_lists": 1, "similarity": "COS", "kind": "vector-hnsw", @@ -78,34 +78,27 @@ def __init__( self.collection_name = collection_name self.vector_search_index_name = vector_search_index_name self.vector_search_kwargs = vector_search_kwargs - self._mongo_client = Optional[MongoClient] = None - self._collection = Optional[Collection] = None - - @property - def mongo_client(self) -> MongoClient: - if self._mongo_client is None: - self._mongo_client = MongoClient( - self.mongo_connection_string.resolve_value(), - appname="HayStack-CDBMongoVCore-DocumentStore-Python", - driver=DriverInfo(name="AzureCosmosDBMongoVCoreDocumentStore") - ) - return self._mongo_client - - @property - def collection(self) -> Collection: - if self._collection is None: - database = self.mongo_client[self.database_name] - if self.collection_name not in database.list_collection_names(): - # check the kind of vector search to be performed - # prepare the command accordingly - create_index_commands = {} - if self.vector_search_kwargs.get("kind") == "vector-ivf": - create_index_commands = self._get_vector_index_ivf() - elif self.vector_search_kwargs.get("kind") == "vector-hnsw": - create_index_commands = self._get_vector_index_hnsw() - database.command(create_index_commands) - self._collection = database[self.collection_name] - return self._collection + + self._mongo_client = MongoClient( + self.mongo_connection_string.resolve_value(), + appname="HayStack-CDBMongoVCore-DocumentStore-Python", + driver=DriverInfo(name="AzureCosmosDBMongoVCoreHayStackIntegration") + ) + + self._collection = self._create_collection_and_index() + + def _create_collection_and_index(self) -> Collection: + database = self._mongo_client[self.database_name] + if self.collection_name not in database.list_collection_names(): + # check the kind of vector search to be performed + # prepare the command accordingly + create_index_commands = {} + if self.vector_search_kwargs.get("kind") == "vector-ivf": + create_index_commands = self._get_vector_index_ivf() + elif self.vector_search_kwargs.get("kind") == "vector-hnsw": + create_index_commands = self._get_vector_index_hnsw() + database.command(create_index_commands) + return database[self.collection_name] def create_filter_index( self, property_to_filter: str, @@ -136,24 +129,25 @@ def from_dict(cls, data: Dict[str, Any]) -> "AzureCosmosDBMongoVCoreDocumentStor deserialize_secrets_inplace(data["init_parameters"], keys=["mongo_connection_string"]) return default_from_dict(cls, data) - def count_documents(self) -> int: """ Returns how many documents are present in the document store. :returns: The number of documents in the document store. """ - return self.collection.count_documents({}) + return self._collection.count_documents({}) - def delete_documents(self, document_ids: List[str]) -> None: + def delete_documents(self, document_ids: Optional[List[str]] = None, delete_all: Optional[bool] = None) -> None: """ Deletes all documents with a matching document_ids from the document store. :param document_ids: the document ids to delete + :param delete_all: if `True`, delete all documents. """ - if not document_ids: - return - self.collection.delete_many(filter={"id": {"$in": document_ids}}) + if document_ids is not None: + self._collection.delete_many(filter={"id": {"$in": document_ids}}) + elif delete_all: + self._collection.delete_many({}) def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: """ @@ -193,7 +187,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D if policy == DuplicatePolicy.SKIP: operations = [UpdateOne({"id": doc["id"]}, {"$setOnInsert": doc}, upsert=True) for doc in mongo_documents] - existing_documents = self.collection.count_documents({"id": {"$in": [doc.id for doc in documents]}}) + existing_documents = self._collection.count_documents({"id": {"$in": [doc.id for doc in documents]}}) written_docs -= existing_documents elif policy == DuplicatePolicy.FAIL: operations = [InsertOne(doc) for doc in mongo_documents] @@ -201,7 +195,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D operations = [ReplaceOne({"id": doc["id"]}, upsert=True, replacement=doc) for doc in mongo_documents] try: - self.collection.bulk_write(operations) + self._collection.bulk_write(operations) except BulkWriteError as e: msg = f"Duplicate documents found: {e.details['writeErrors']}" raise DuplicateDocumentError(msg) from e @@ -219,7 +213,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc :returns: A list of Documents that match the given filters. """ filters = _normalize_filters(filters) if filters else None - documents = list(self.collection.find(filters)) + documents = list(self._collection.find(filters)) for doc in documents: doc.pop("_id", None) # MongoDB's internal id doesn't belong into a Haystack document, so we remove it. return [Document.from_dict(doc) for doc in documents] @@ -247,7 +241,7 @@ def _embedding_retrieval( pipeline = self._get_pipeline_vector_hnsw(query_embedding, top_k, filters) try: - documents = list(self.collection.aggregate(pipeline)) + documents = list(self._collection.aggregate(pipeline)) except Exception as e: msg = f"Retrieval of documents from MongoDB Atlas failed: {e}" if filters: diff --git a/integrations/azure_cosmos_db_mongo_vcore/tests/test_document_store.py b/integrations/azure_cosmos_db_mongo_vcore/tests/test_document_store.py new file mode 100644 index 000000000..1284ab992 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/tests/test_document_store.py @@ -0,0 +1,75 @@ +import os + +import pytest +from haystack.dataclasses.document import Document +from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.utils import Secret + +from haystack_integrations.document_stores.mongodb_vcore import \ + AzureCosmosDBMongoVCoreDocumentStore + + +@pytest.mark.skipif( + "AZURE_COSMOS_MONGO_CONNECTION_STRING" not in os.environ, + reason="No Azure Cosmos DB connection string provided", +) +@pytest.mark.integration +class TestDocumentStore(DocumentStoreBaseTests): + @pytest.fixture + def document_store(self): + vector_search_kwargs = { + "dimensions": 768, + "num_lists": 1, + "similarity": "COS", + "kind": "vector-hnsw", + "m": 2, + "ef_construction": 64, + "ef_search": 40 + } + store = AzureCosmosDBMongoVCoreDocumentStore( + mongo_connection_string=Secret.from_env_var("AZURE_COSMOS_MONGO_CONNECTION_STRING"), + database_name="haystack_db", + collection_name="haystack_collection", + vector_search_index_name="haystack_index", + vector_search_kwargs=vector_search_kwargs, + ) + + yield store + + def test_write_document(self, document_store: AzureCosmosDBMongoVCoreDocumentStore): + docs = [Document(content="some text")] + assert document_store.write_documents(docs) == 1 + + def test_complex_filter(self, document_store, filterable_docs): + document_store.write_documents(filterable_docs) + filters = { + "operator": "OR", + "conditions": [ + { + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "==", "value": 100}, + {"field": "meta.chapter", "operator": "==", "value": "intro"}, + ], + }, + { + "operator": "AND", + "conditions": [ + {"field": "meta.page", "operator": "==", "value": "90"}, + {"field": "meta.chapter", "operator": "==", "value": "conclusion"}, + ], + }, + ], + } + + result = document_store.filter_documents(filters=filters) + + self.assert_documents_are_equal( + result, + [ + d + for d in filterable_docs + if (d.meta.get("number") == 100 and d.meta.get("chapter") == "intro") + or (d.meta.get("page") == "90" and d.meta.get("chapter") == "conclusion") + ], + ) diff --git a/integrations/azure_cosmos_db_mongo_vcore/tests/test_embedding_retrieval.py b/integrations/azure_cosmos_db_mongo_vcore/tests/test_embedding_retrieval.py new file mode 100644 index 000000000..9561c0e08 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/tests/test_embedding_retrieval.py @@ -0,0 +1,88 @@ +import os +from typing import List + +import pytest +from haystack.document_stores.errors.errors import DocumentStoreError +from haystack.utils import Secret + +from haystack_integrations.document_stores.mongodb_vcore import \ + AzureCosmosDBMongoVCoreDocumentStore +from test_utils import get_documents + +vector_search_kwargs = { + "dimensions": 768, + "num_lists": 1, + "similarity": "COS", + "kind": "vector-hnsw", + "m": 2, + "ef_construction": 64, + "ef_search": 40 +} + + +@pytest.mark.skipif( + "AZURE_COSMOS_MONGO_CONNECTION_STRING" not in os.environ, + reason="No Azure Cosmos DB connection string provided", +) +@pytest.mark.integration +class TestEmbeddingRetrieval: + def test_embedding_retrieval_cosine(self): + store = AzureCosmosDBMongoVCoreDocumentStore( + mongo_connection_string=Secret.from_env_var("AZURE_COSMOS_MONGO_CONNECTION_STRING"), + database_name="haystack_db", + collection_name="haystack_collection", + vector_search_index_name="haystack_index", + vector_search_kwargs=vector_search_kwargs, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["similarityScore"] > results[1].meta["similarityScore"] + store.delete_documents(delete_all=True) + + def test_embedding_retrieval_euclidean(self): + vector_search_kwargs["similarity"] = "L2" + store = AzureCosmosDBMongoVCoreDocumentStore( + mongo_connection_string=Secret.from_env_var("AZURE_COSMOS_MONGO_CONNECTION_STRING"), + database_name="haystack_db", + collection_name="haystack_collection", + vector_search_index_name="haystack_index", + vector_search_kwargs=vector_search_kwargs, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["similarityScore"] > results[1].meta["similarityScore"] + store.delete_documents(delete_all=True) + + def test_embedding_retrieval_inner_product(self): + vector_search_kwargs["similarity"] = "IP" + store = AzureCosmosDBMongoVCoreDocumentStore( + mongo_connection_string=Secret.from_env_var("AZURE_COSMOS_MONGO_CONNECTION_STRING"), + database_name="haystack_db", + collection_name="haystack_collection", + vector_search_index_name="haystack_index", + vector_search_kwargs=vector_search_kwargs, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["similarityScore"] > results[1].meta["similarityScore"] + store.delete_documents(delete_all=True) + + def test_empty_query_embedding(self): + store = AzureCosmosDBMongoVCoreDocumentStore( + mongo_connection_string=Secret.from_env_var("AZURE_COSMOS_MONGO_CONNECTION_STRING"), + database_name="haystack_db", + collection_name="haystack_collection", + vector_search_index_name="haystack_index", + vector_search_kwargs=vector_search_kwargs, + ) + query_embedding: List[float] = [] + with pytest.raises(DocumentStoreError): + store._embedding_retrieval(query_embedding=query_embedding) + + diff --git a/integrations/azure_cosmos_db_mongo_vcore/tests/test_utils.py b/integrations/azure_cosmos_db_mongo_vcore/tests/test_utils.py new file mode 100644 index 000000000..1567a2ae5 --- /dev/null +++ b/integrations/azure_cosmos_db_mongo_vcore/tests/test_utils.py @@ -0,0 +1,43 @@ +import random +from typing import List + +from haystack.dataclasses import Document + + +def get_documents() -> List[Document]: + documents = [Document( + content=f"Document A", + meta={ + "name": f"name_A", + "page": "100", + "chapter": "intro", + "number": 2, + "date": "1969-07-21T20:17:40", + }, + embedding=_random_embeddings(768), + ), Document( + content=f"Document B", + meta={ + "name": f"name_B", + "page": "123", + "chapter": "abstract", + "number": -2, + "date": "1972-12-11T19:54:58", + }, + embedding=_random_embeddings(768), + ), Document( + content=f"Document C", + meta={ + "name": f"name_C", + "page": "90", + "chapter": "conclusion", + "number": -10, + "date": "1989-11-09T17:53:00", + }, + embedding=_random_embeddings(768), + )] + return documents + + +def _random_embeddings(n): + return [random.random() for _ in range(n)] diff --git a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py index 0ae889af7..922c13c72 100644 --- a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py +++ b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/components/retrievers/nosql/__init__.py @@ -1,4 +1,4 @@ -from integrations.azure_cosmos_db_nosql.src.haystack_integrations.components.retrievers.nosql.embedding_retriever import \ +from haystack_integrations.components.retrievers.nosql.embedding_retriever import \ AzureCosmosDBNoSqlEmbeddingRetriever diff --git a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py index 770f5427d..986fbbadf 100644 --- a/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py +++ b/integrations/azure_cosmos_db_nosql/src/haystack_integrations/document_stores/nosql/document_store.py @@ -6,6 +6,7 @@ from azure.identity import ClientSecretCredential from haystack import default_from_dict from haystack import Document +from haystack.document_stores.errors.errors import DocumentStoreError from haystack.document_stores.types import DuplicatePolicy from haystack.utils import Secret, deserialize_secrets_inplace @@ -56,20 +57,19 @@ def __init__( self.vector_embedding_policy = vector_embedding_policy self.indexing_policy = indexing_policy self.cosmos_container_properties = cosmos_container_properties - self._container = Optional[ContainerProxy] = None + self._container: Optional[ContainerProxy] = None @property def container(self) -> ContainerProxy: # Create the database if it already doesn't exist database = self.cosmos_client.create_database_if_not_exists(id=self.database_name) - if database.get_container_client(self.container_name).read() is not None: - # Create the collection if it already doesn't exist - self._container = database.create_container( - id=self.container_name, - partition_key=self.cosmos_container_properties["partition_key"], - indexing_policy=self.indexing_policy, - vector_embedding_policy=self.vector_embedding_policy, - ) + + self._container = database.create_container_if_not_exists( + id=self.container_name, + partition_key=self.cosmos_container_properties["partition_key"], + indexing_policy=self.indexing_policy, + vector_embedding_policy=self.vector_embedding_policy, + ) return self._container @classmethod @@ -162,7 +162,7 @@ def delete_documents(self, document_ids: List[str]) -> None: :param document_ids: the document ids to delete """ if not document_ids: - return + raise ValueError("document_ids cannot be empty") for document_id in document_ids: self.container.delete_item(document_id) @@ -217,30 +217,33 @@ def _embedding_retrieval( :raises ValueError: If `query_embedding` is empty. :raises DocumentStoreError: If the retrieval of documents from MongoDB Atlas fails. """ - embedding_key = self.vector_embedding_policy["vectorEmbeddings"][0]["path"][1:] query = "SELECT " # If limit_offset_clause is not specified, add TOP clause if filters is None or filters.get("limit_offset_clause") is None: query += "TOP @limit " query += ( - "c.id, c.@embeddingKey, c.content, c.meta, c.score, " - "VectorDistance(c.@embeddingKey, @embeddings) AS SimilarityScore FROM c" + "c.id, c.embedding, c.content, c.meta, c.score, " + "VectorDistance(c.embedding, @embeddings) AS SimilarityScore FROM c" ) # Add where_clause if specified if filters is not None and filters.get("where_clause") is not None: query += " {}".format(filters["where_clause"]) - query += " ORDER BY VectorDistance(c.@embeddingKey, @embeddings)" + query += " ORDER BY VectorDistance(c.embedding, @embeddings)" # Add limit_offset_clause if specified if filters is not None and filters.get("limit_offset_clause") is not None: query += " {}".format(filters["limit_offset_clause"]) parameters = [ {"name": "@limit", "value": top_k}, - {"name": "@embeddingKey", "value": embedding_key}, {"name": "@embeddings", "value": query_embedding}, ] - items = list(self.container.query_items(query, parameters=parameters, enable_cross_partition_query=True)) + try: + items = list(self.container.query_items(query, parameters=parameters, enable_cross_partition_query=True)) + except Exception as e: + msg = f"Retrieval of documents from Azure CosmosDB NoSQL failed: {str(e)}" + raise DocumentStoreError(msg) from e + nearest_results = [self._cosmos_doc_to_haystack_doc(item) for item in items] return nearest_results @@ -249,14 +252,14 @@ def _filtered_query(self, filters: Dict[str, Any]) -> str: return "SELECT * FROM c" query = "SELECT " - if filters["top"] is not None: + if "top" in filters: query += filters["top"] - query += "* FROM c" - if filters["where"] is not None: + query += "* FROM c " + if "where" in filters: query += filters["where"] - if filters["order_by"] is not None: + if "order_by" in filters: query += filters["order_by"] - if filters["limit_offset"] is not None and filters["top"] is None: + if "limit_offset" in filters and "top" not in filters: query += filters["limit_offset"] return query diff --git a/integrations/azure_cosmos_db_nosql/tests/test_document_store.py b/integrations/azure_cosmos_db_nosql/tests/test_document_store.py new file mode 100644 index 000000000..4def7bf91 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/tests/test_document_store.py @@ -0,0 +1,49 @@ +import os +from typing import Any, Dict + +import pytest +from azure.cosmos import PartitionKey +from haystack.dataclasses import Document +from haystack.testing.document_store import DocumentStoreBaseTests + +from haystack_integrations.document_stores.nosql import \ + AzureCosmosDBNoSqlDocumentStore +from test_utils import get_vector_embedding_policy, get_vector_indexing_policy + + +@pytest.mark.skipif( + "AZURE_COSMOS_NOSQL_CONNECTION_STRING" not in os.environ, + reason="No Azure Cosmos DB connection string provided", +) +@pytest.mark.integration +class TestDocumentStore(DocumentStoreBaseTests): + @pytest.fixture + def document_store(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "cosine"), + indexing_policy=get_vector_indexing_policy("quantized_flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + yield store + + def test_write_documents(self, document_store: AzureCosmosDBNoSqlDocumentStore): + docs = [Document(content="some text")] + assert document_store.write_documents(docs) == 1 + + def test_complex_filter(self, document_store, filterable_docs): + document_store.write_documents(filterable_docs) + filters = { + "where": "WHERE c.meta.number=100 and c.meta.chapter='intro'" + } + result = document_store.filter_documents(filters=filters) + self.assert_documents_are_equal( + result, + [ + d + for d in filterable_docs + if (d.meta.get("number") == 100) + ], + ) + document_store.delete_documents(delete_all=True) diff --git a/integrations/azure_cosmos_db_nosql/tests/test_embedding_retrieval.py b/integrations/azure_cosmos_db_nosql/tests/test_embedding_retrieval.py new file mode 100644 index 000000000..3c5de2164 --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/tests/test_embedding_retrieval.py @@ -0,0 +1,112 @@ +import os +from typing import Any, Dict, List + +import pytest +from azure.cosmos import PartitionKey +from haystack.document_stores.errors.errors import DocumentStoreError + +from haystack_integrations.document_stores.nosql import \ + AzureCosmosDBNoSqlDocumentStore +from test_utils import get_documents, get_vector_embedding_policy, get_vector_indexing_policy + + +@pytest.mark.skipif( + "AZURE_COSMOS_NOSQL_CONNECTION_STRING" not in os.environ, + reason="No Azure Cosmos DB connection string provided", +) +class TestEmbeddingRetrieval: + def test_embedding_retrieval_cosine_flat(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "cosine"), + indexing_policy=get_vector_indexing_policy("flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["SimilarityScore"] > results[1].meta["SimilarityScore"] + + def test_embedding_retrieval_euclidean_flat(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "euclidean"), + indexing_policy=get_vector_indexing_policy("flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["SimilarityScore"] > results[1].meta["SimilarityScore"] + + def test_embedding_retrieval_dot_product_flat(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "dotProduct"), + indexing_policy=get_vector_indexing_policy("flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["SimilarityScore"] > results[1].meta["SimilarityScore"] + + def test_embedding_retrieval_cosine_quantized_flat(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "cosine"), + indexing_policy=get_vector_indexing_policy("quantized_flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["SimilarityScore"] > results[1].meta["SimilarityScore"] + + def test_embedding_retrieval_euclidean_quantized_flat(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "euclidean"), + indexing_policy=get_vector_indexing_policy("quantized_flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["SimilarityScore"] > results[1].meta["SimilarityScore"] + + def test_embedding_retrieval_dot_product_quantized_flat(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "dotProduct"), + indexing_policy=get_vector_indexing_policy("quantized_flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + store.write_documents(get_documents()) + query_embedding = [0.1] * 768 + results = store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].meta["SimilarityScore"] > results[1].meta["SimilarityScore"] + + def test_empty_query_embedding(self): + store = AzureCosmosDBNoSqlDocumentStore.from_connection_string( + database_name="haystack_db", + container_name="haystack_container", + vector_embedding_policy=get_vector_embedding_policy("float32", 768, "dotProduct"), + indexing_policy=get_vector_indexing_policy("quantized_flat"), + cosmos_container_properties={"partition_key": PartitionKey(path="/id")}, + ) + query_embedding: List[float] = [] + with pytest.raises(DocumentStoreError): + store._embedding_retrieval(query_embedding=query_embedding) diff --git a/integrations/azure_cosmos_db_nosql/tests/test_utils.py b/integrations/azure_cosmos_db_nosql/tests/test_utils.py new file mode 100644 index 000000000..92bda9c8c --- /dev/null +++ b/integrations/azure_cosmos_db_nosql/tests/test_utils.py @@ -0,0 +1,67 @@ +import random +from typing import Any, Dict, List + +from haystack.dataclasses import Document + + +def get_vector_indexing_policy(embedding_type: str) -> Dict[str, Any]: + return { + "indexingMode": "consistent", + "includedPaths": [{"path": "/*"}], + "excludedPaths": [{"path": '/"_etag"/?'}], + "vectorIndexes": [{"path": "/embedding", "type": embedding_type}], + } + + +def get_vector_embedding_policy( + data_type: str, dimensions: int, distance_function: str +) -> Dict[str, Any]: + return { + "vectorEmbeddings": [ + { + "path": "/embedding", + "dataType": data_type, + "dimensions": dimensions, + "distanceFunction": distance_function, + } + ] + } + + +def get_documents() -> List[Document]: + documents = [Document( + content=f"Document A", + meta={ + "name": f"name_A", + "page": "100", + "chapter": "intro", + "number": 2, + "date": "1969-07-21T20:17:40", + }, + embedding=_random_embeddings(768), + ), Document( + content=f"Document B", + meta={ + "name": f"name_B", + "page": "123", + "chapter": "abstract", + "number": -2, + "date": "1972-12-11T19:54:58", + }, + embedding=_random_embeddings(768), + ), Document( + content=f"Document C", + meta={ + "name": f"name_C", + "page": "90", + "chapter": "conclusion", + "number": -10, + "date": "1989-11-09T17:53:00", + }, + embedding=_random_embeddings(768), + )] + return documents + + +def _random_embeddings(n): + return [random.random() for _ in range(n)]