Merge branch 'main' into document-language-classifier
julian-risch authored Oct 13, 2023
2 parents 2837d69 + 988fa61 commit f1b9ecd
Showing 14 changed files with 125 additions and 43 deletions.
35 changes: 31 additions & 4 deletions .github/workflows/tests.yml
@@ -124,10 +124,10 @@ jobs:
include:
- topic: document_stores
os: ubuntu-latest
- dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+ dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
- topic: document_stores
os: windows-latest
- dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+ dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -329,7 +329,7 @@ jobs:
runs-on: ${{ matrix.os }}
services:
elasticsearch:
- image: elasticsearch:8.8.0
+ image: elasticsearch:8.10.2
env:
discovery.type: "single-node"
xpack.security.enabled: "false"
@@ -346,9 +346,36 @@ jobs:
- name: Install Haystack
run: pip install .[elasticsearch8,dev,preprocessing,inference]

+ - name: Make Elasticsearch comfortable with an almost-full disk
+ run: |
+ curl -X PUT "localhost:9200/_cluster/settings?pretty" -H 'Content-Type: application/json' -d'
+ {
+ "persistent": {
+ "cluster.routing.allocation.disk.watermark.low": "90%",
+ "cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
+ "cluster.routing.allocation.disk.watermark.high": "95%",
+ "cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
+ "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
+ "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
+ "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
+ "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
+ }
+ }
+ '
+ curl -X PUT "localhost:9200/*/_settings?expand_wildcards=all&pretty" -H 'Content-Type: application/json' -d'
+ {
+ "index.blocks.read_only_allow_delete": null
+ }
+ '
- name: Run tests
run: |
- pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_elasticsearch.py
+ pytest -x -m"document_store and integration" test/document_stores/test_elasticsearch.py
+ - name: logs
+ if: failure()
+ run: |
+ docker logs "${{ job.services.elasticsearch.id }}"
- name: Calculate alert data
id: calculator
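
For anyone reproducing this CI step outside the workflow, here is a minimal Python sketch of the same two cluster calls, using the third-party requests package (host, port, and the shortened settings list are assumptions; the endpoints mirror the curl commands above):

    # Sketch: relax Elasticsearch disk watermarks so a nearly full CI disk does
    # not push indices into read-only mode. Assumes Elasticsearch on
    # localhost:9200 with security disabled, as in the workflow service above.
    import requests

    watermarks = {
        "persistent": {
            "cluster.routing.allocation.disk.watermark.low": "90%",
            "cluster.routing.allocation.disk.watermark.high": "95%",
            "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
        }
    }
    requests.put("http://localhost:9200/_cluster/settings", json=watermarks).raise_for_status()

    # Clear any read-only block already applied to existing indices
    # (JSON null resets the setting, as in the second curl call).
    requests.put(
        "http://localhost:9200/*/_settings",
        params={"expand_wildcards": "all"},
        json={"index.blocks.read_only_allow_delete": None},
    ).raise_for_status()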
2 changes: 1 addition & 1 deletion examples/test_getting_started.py
@@ -6,7 +6,7 @@
from haystack.schema import Answer, Document


- @pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
+ @pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
def test_getting_started(provider):
if provider == "anthropic":
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
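An alternative to dropping a provider from the parametrize list is to skip at runtime when its key is missing; a hypothetical sketch (the env-var mapping and the trailing helper call are assumptions, not the committed test):

    import os

    import pytest

    @pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
    def test_getting_started(provider):
        env_var = {"cohere": "COHERE_API_KEY",
                   "huggingface": "HUGGINGFACE_API_KEY",
                   "openai": "OPENAI_API_KEY"}[provider]
        api_key = os.environ.get(env_var, "")
        if not api_key:
            pytest.skip(f"{env_var} is not set")
        # ... run the getting-started pipeline with api_key here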
2 changes: 2 additions & 0 deletions haystack/nodes/prompt/prompt_model.py
@@ -98,6 +98,8 @@ def create_invocation_layer(
f" Currently supported invocation layers are: {PromptModelInvocationLayer.invocation_layer_providers}"
f" You can implement and provide custom invocation layer for {self.model_name_or_path} by subclassing "
"PromptModelInvocationLayer."
f" Also please ensure you are authorised to load the model {self.model_name_or_path} and you are "
"logged-in into the huggingface cli."
)

def invoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **kwargs) -> List[str]:
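To satisfy the hint added to this error message, a minimal sketch of logging in to the Hugging Face Hub from Python (equivalent to running `huggingface-cli login` once in the shell; the token value is a placeholder):

    # Sketch: authenticate so gated or private models can be downloaded.
    from huggingface_hub import login

    login(token="hf_...")  # placeholder; read the real token from a secret store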
2 changes: 1 addition & 1 deletion haystack/pipelines/base.py
@@ -963,7 +963,7 @@ def eval_beir(
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
except ModuleNotFoundError as e:
raise HaystackError("beir is not installed. Please run `pip install farm-haystack[beir]`...") from e
raise HaystackError("beir is not installed. Please run `pip install beir`") from e

url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, dataset_dir)
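Since `beir` is no longer a farm-haystack extra, it must be installed directly; the guard above follows the usual optional-import pattern, sketched here in general form (the helper name and message are illustrative):

    # Sketch: import an optional dependency lazily and fail with an actionable message.
    def require_beir():
        try:
            import beir
        except ModuleNotFoundError as e:
            raise RuntimeError("beir is not installed. Please run `pip install beir`") from e
        return beir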
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Dict, Any

from haystack.preview import component, Document, default_to_dict, default_from_dict
- from haystack.preview.embedding_backends.sentence_transformers_backend import (
+ from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
_SentenceTransformersEmbeddingBackendFactory,
)

@@ -29,11 +29,13 @@ def __init__(
"""
Create a SentenceTransformersDocumentEmbedder component.
- :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``.
- :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
+ :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
+ such as ``'sentence-transformers/all-mpnet-base-v2'``.
+ :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
+ Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
- If this parameter is set to `True`, then the token generated when running
- `transformers-cli login` (stored in ~/.huggingface) will be used.
+ If this parameter is set to `True`, then the token generated when running
+ `transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each Document text before embedding.
:param suffix: A string to add to the end of each Document text before embedding.
:param batch_size: Number of strings to encode at once.
@@ -95,7 +97,7 @@ def run(self, documents: List[Document]):
Embed a list of Documents.
The embedding of each Document is stored in the `embedding` field of the Document.
"""
- if not isinstance(documents, list) or not isinstance(documents[0], Document):
+ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
"SentenceTransformersDocumentEmbedder expects a list of Documents as input."
"In case you want to embed a list of strings, please use the SentenceTransformersTextEmbedder."
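The reworked check also fixes a latent bug: the old guard indexed `documents[0]` and would raise an IndexError on an empty list, while `documents and not isinstance(documents[0], Document)` short-circuits to False for `[]`. A small sketch of the guard in isolation (the helper name is illustrative):

    from haystack.preview import Document

    def has_wrong_input_type(documents) -> bool:
        # `or` binds looser than `and`, so this reads:
        # not-a-list, OR (non-empty AND first element is not a Document)
        return not isinstance(documents, list) or bool(documents) and not isinstance(documents[0], Document)

    assert has_wrong_input_type("text")        # not a list
    assert has_wrong_input_type(["text"])      # list of strings, not Documents
    assert not has_wrong_input_type([])        # empty list passes and embeds to []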
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Dict, Any

from haystack.preview import component, default_to_dict, default_from_dict
- from haystack.preview.embedding_backends.sentence_transformers_backend import (
+ from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
_SentenceTransformersEmbeddingBackendFactory,
)

@@ -26,11 +26,13 @@ def __init__(
"""
Create a SentenceTransformersTextEmbedder component.
- :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``.
- :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
+ :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
+ such as ``'sentence-transformers/all-mpnet-base-v2'``.
+ :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
+ Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
- If this parameter is set to `True`, then the token generated when running
- `transformers-cli login` (stored in ~/.huggingface) will be used.
+ If this parameter is set to `True`, then the token generated when running
+ `transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
:param batch_size: Number of strings to encode at once.
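The `prefix` and `suffix` parameters documented above matter for instruction-style embedding models. A hypothetical usage sketch (the model name follows the E5 convention, which expects a "query: " prefix; the import path and the `warm_up()` call are assumptions based on the preview layout, not documented defaults):

    from haystack.preview.components.embedders.sentence_transformers_text_embedder import (
        SentenceTransformersTextEmbedder,
    )

    embedder = SentenceTransformersTextEmbedder(
        model_name_or_path="intfloat/e5-base-v2",
        prefix="query: ",
    )
    embedder.warm_up()  # assumed to load the sentence-transformers backend
    embedding = embedder.run(text="What is the capital of France?")["embedding"]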
24 changes: 19 additions & 5 deletions haystack/preview/components/rankers/similarity.py
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
- from typing import List, Union, Dict, Any
+ from typing import List, Union, Dict, Any, Optional

from haystack.preview import ComponentError, Document, component, default_from_dict, default_to_dict
from haystack.preview.lazy_imports import LazyImport
@@ -34,17 +34,24 @@ class SimilarityRanker:
"""

def __init__(
- self, model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2", device: str = "cpu"
+ self,
+ model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2",
+ top_k: int = 10,
+ device: str = "cpu",
):
"""
Creates an instance of SimilarityRanker.
:param model_name_or_path: Path to a pre-trained sentence-transformers model.
+ :param top_k: The maximum number of documents to return per query.
:param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
"""
torch_and_transformers_import.check()

self.model_name_or_path = model_name_or_path
+ if top_k <= 0:
+ raise ValueError(f"top_k must be > 0, but got {top_k}")
+ self.top_k = top_k
self.device = device
self.model = None
self.tokenizer = None
@@ -63,7 +70,7 @@ def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
- return default_to_dict(self, device=self.device, model_name_or_path=self.model_name_or_path)
+ return default_to_dict(self, top_k=self.top_k, device=self.device, model_name_or_path=self.model_name_or_path)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":
@@ -73,17 +80,24 @@ def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
- def run(self, query: str, documents: List[Document]):
+ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
"""
Returns a list of documents ranked by their similarity to the given query
:param query: Query string.
:param documents: List of Documents.
+ :param top_k: The maximum number of documents to return.
:return: List of Documents sorted by (desc.) similarity with the query.
"""
if not documents:
return {"documents": []}

+ if top_k is None:
+ top_k = self.top_k

+ elif top_k <= 0:
+ raise ValueError(f"top_k must be > 0, but got {top_k}")

# If a model path is provided but the model isn't loaded
if self.model_name_or_path and not self.model:
raise ComponentError(
@@ -105,4 +119,4 @@ def run(self, query: str, documents: List[Document]):
i = sorted_index_tensor.item()
documents[i].score = similarity_scores[i].item()
ranked_docs.append(documents[i])
return {"documents": ranked_docs}
return {"documents": ranked_docs[:top_k]}
7 changes: 2 additions & 5 deletions pyproject.toml
@@ -152,9 +152,6 @@ docstores-gpu = [
audio = [
"openai-whisper"
]
- beir = [
- "beir; platform_system != 'Windows'",
- ]
aws = [
"boto3",
# Constrain botocore to avoid taking too much time to resolve the dependency tree.
@@ -246,11 +243,11 @@ formatting = [
]

all = [
"farm-haystack[inference,docstores,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,preview]",
"farm-haystack[inference,docstores,crawler,preprocessing,file-conversion,pdf,ocr,metrics,aws,preview]",
]
all-gpu = [
# beir is incompatible with faiss-gpu: https://github.com/beir-cellar/beir/issues/71
"farm-haystack[inference,docstores-gpu,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx-gpu,metrics,aws,preview]",
"farm-haystack[inference,docstores-gpu,crawler,preprocessing,file-conversion,pdf,ocr,metrics,aws,preview]",
]

[project.scripts]
3 changes: 3 additions & 0 deletions releasenotes/notes/review-all-extras-42d5a3a3d61f5393.yaml
@@ -0,0 +1,3 @@
+ upgrade:
+ - |
+ Removes the `audio`, `ray`, `onnx` and `beir` extras from the extra group `all`.
16 changes: 7 additions & 9 deletions test/pipelines/test_ray.py
@@ -1,17 +1,15 @@
from pathlib import Path

import pytest
- import ray

from haystack.pipelines import RayPipeline


- @pytest.fixture(autouse=True)
- def shutdown_ray():
- yield
+ @pytest.fixture()
+ def ray():
try:
import ray

+ yield ray

ray.serve.shutdown()
ray.shutdown()
except:
@@ -21,7 +19,7 @@ def shutdown_ray():
@pytest.mark.integration
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("serve_detached", [True, False])
- def test_load_pipeline(document_store_with_docs, serve_detached, samples_path):
+ def test_load_pipeline(ray, document_store_with_docs, serve_detached, samples_path):
pipeline = RayPipeline.load_from_yaml(
samples_path / "pipeline" / "ray.simple.haystack-pipeline.yml",
pipeline_name="ray_query_pipeline",
@@ -43,7 +41,7 @@ def test_load_pipeline(document_store_with_docs, serve_detached, samples_path):

@pytest.mark.integration
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
- def test_load_advanced_pipeline(document_store_with_docs, samples_path):
+ def test_load_advanced_pipeline(ray, document_store_with_docs, samples_path):
pipeline = RayPipeline.load_from_yaml(
samples_path / "pipeline" / "ray.advanced.haystack-pipeline.yml",
pipeline_name="ray_query_pipeline",
@@ -71,7 +69,7 @@ def test_load_advanced_pipeline(document_store_with_docs, samples_path):
@pytest.mark.asyncio
@pytest.mark.integration
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
- async def test_load_advanced_pipeline_async(document_store_with_docs, samples_path):
+ async def test_load_advanced_pipeline_async(ray, document_store_with_docs, samples_path):
pipeline = RayPipeline.load_from_yaml(
samples_path / "pipeline" / "ray.advanced.haystack-pipeline.yml",
pipeline_name="ray_query_pipeline",
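The rewritten fixture imports `ray` inside its body, so the module can be collected when ray is not installed. A compact variant of the same idea using pytest's built-in helper (a sketch, not the committed code):

    import pytest

    @pytest.fixture()
    def ray():
        # importorskip skips every test requesting this fixture when ray is
        # absent, rather than silencing failures in a bare except.
        ray = pytest.importorskip("ray")
        yield ray
        # ray.serve is importable here because the Ray pipeline code pulls it
        # in during the test; shut both layers down to isolate tests.
        ray.serve.shutdown()
        ray.shutdown()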
@@ -1,12 +1,12 @@
from unittest.mock import patch
import pytest
- from haystack.preview.embedding_backends.sentence_transformers_backend import (
+ from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
_SentenceTransformersEmbeddingBackendFactory,
)


@pytest.mark.unit
@patch("haystack.preview.embedding_backends.sentence_transformers_backend.SentenceTransformer")
@patch("haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer")
def test_factory_behavior(mock_sentence_transformer):
embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="my_model", device="cpu"
@@ -21,7 +21,7 @@ def test_factory_behavior(mock_sentence_transformer):


@pytest.mark.unit
@patch("haystack.preview.embedding_backends.sentence_transformers_backend.SentenceTransformer")
@patch("haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer")
def test_model_initialization(mock_sentence_transformer):
_SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="model", device="cpu", use_auth_token="my_token"
@@ -32,7 +32,7 @@ def test_model_initialization(mock_sentence_transformer):


@pytest.mark.unit
@patch("haystack.preview.embedding_backends.sentence_transformers_backend.SentenceTransformer")
@patch("haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer")
def test_embedding_function_with_kwargs(mock_sentence_transformer):
embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(model_name_or_path="model")

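These patch targets track the module move: `unittest.mock.patch` must point at the location where a name is looked up, not where it is defined. A minimal sketch against the new backend path (the assertion is kept deliberately loose):

    from unittest.mock import patch

    from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
        _SentenceTransformersEmbeddingBackendFactory,
    )

    # Patching sentence_transformers.SentenceTransformer directly would not
    # work: the backend module already holds its own reference to the class.
    with patch(
        "haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer"
    ) as mock_st:
        _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
            model_name_or_path="my_model", device="cpu"
        )
        mock_st.assert_called_once()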