Merge branch 'main' into document-language-classifier
julian-risch authored Oct 13, 2023
2 parents 2837d69 + 988fa61 commit f1b9ecd
Showing 14 changed files with 125 additions and 43 deletions.
35 changes: 31 additions & 4 deletions .github/workflows/tests.yml
@@ -124,10 +124,10 @@ jobs:
include:
- topic: document_stores
os: ubuntu-latest
- dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+ dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
- topic: document_stores
os: windows-latest
- dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+ dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -329,7 +329,7 @@ jobs:
runs-on: ${{ matrix.os }}
services:
elasticsearch:
- image: elasticsearch:8.8.0
+ image: elasticsearch:8.10.2
env:
discovery.type: "single-node"
xpack.security.enabled: "false"
@@ -346,9 +346,36 @@ jobs:
- name: Install Haystack
run: pip install .[elasticsearch8,dev,preprocessing,inference]

+ - name: Make Elasticsearch comfortable with an almost-full disk
+ run: |
+ curl -X PUT "localhost:9200/_cluster/settings?pretty" -H 'Content-Type: application/json' -d'
+ {
+ "persistent": {
+ "cluster.routing.allocation.disk.watermark.low": "90%",
+ "cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
+ "cluster.routing.allocation.disk.watermark.high": "95%",
+ "cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
+ "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
+ "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
+ "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
+ "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
+ }
+ }
+ '
+ curl -X PUT "localhost:9200/*/_settings?expand_wildcards=all&pretty" -H 'Content-Type: application/json' -d'
+ {
+ "index.blocks.read_only_allow_delete": null
+ }
+ '
- name: Run tests
run: |
- pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_elasticsearch.py
+ pytest -x -m"document_store and integration" test/document_stores/test_elasticsearch.py
+ - name: logs
+ if: failure()
+ run: |
+ docker logs "${{ job.services.elasticsearch.id }}"
- name: Calculate alert data
id: calculator
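
For anyone reproducing this CI step outside the workflow, here is a minimal Python sketch of the same two cluster calls, using the third-party requests package (host, port, and the shortened settings list are assumptions; the endpoints mirror the curl commands above):

    # Sketch: relax Elasticsearch disk watermarks so a nearly full CI disk does
    # not push indices into read-only mode. Assumes Elasticsearch on
    # localhost:9200 with security disabled, as in the workflow service above.
    import requests

    watermarks = {
        "persistent": {
            "cluster.routing.allocation.disk.watermark.low": "90%",
            "cluster.routing.allocation.disk.watermark.high": "95%",
            "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
        }
    }
    requests.put("http://localhost:9200/_cluster/settings", json=watermarks).raise_for_status()

    # Clear any read-only block already applied to existing indices
    # (JSON null resets the setting, as in the second curl call).
    requests.put(
        "http://localhost:9200/*/_settings",
        params={"expand_wildcards": "all"},
        json={"index.blocks.read_only_allow_delete": None},
    ).raise_for_status()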
2 changes: 1 addition & 1 deletion examples/test_getting_started.py
@@ -6,7 +6,7 @@
from haystack.schema import Answer, Document


- @pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
+ @pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
def test_getting_started(provider):
if provider == "anthropic":
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
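An alternative to dropping a provider from the parametrize list is to skip at runtime when its key is missing; a hypothetical sketch (the env-var mapping and the trailing helper call are assumptions, not the committed test):

    import os

    import pytest

    @pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
    def test_getting_started(provider):
        env_var = {"cohere": "COHERE_API_KEY",
                   "huggingface": "HUGGINGFACE_API_KEY",
                   "openai": "OPENAI_API_KEY"}[provider]
        api_key = os.environ.get(env_var, "")
        if not api_key:
            pytest.skip(f"{env_var} is not set")
        # ... run the getting-started pipeline with api_key here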
2 changes: 2 additions & 0 deletions haystack/nodes/prompt/prompt_model.py
@@ -98,6 +98,8 @@ def create_invocation_layer(
f" Currently supported invocation layers are: {PromptModelInvocationLayer.invocation_layer_providers}"
f" You can implement and provide custom invocation layer for {self.model_name_or_path} by subclassing "
"PromptModelInvocationLayer."
f" Also please ensure you are authorised to load the model {self.model_name_or_path} and you are "
"logged-in into the huggingface cli."
)

def invoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **kwargs) -> List[str]:
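To satisfy the hint added to this error message, a minimal sketch of logging in to the Hugging Face Hub from Python (equivalent to running `huggingface-cli login` once in the shell; the token value is a placeholder):

    # Sketch: authenticate so gated or private models can be downloaded.
    from huggingface_hub import login

    login(token="hf_...")  # placeholder; read the real token from a secret store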
2 changes: 1 addition & 1 deletion haystack/pipelines/base.py
@@ -963,7 +963,7 @@ def eval_beir(
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
except ModuleNotFoundError as e:
raise HaystackError("beir is not installed. Please run `pip install farm-haystack[beir]`...") from e
raise HaystackError("beir is not installed. Please run `pip install beir`") from e

url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, dataset_dir)
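Since `beir` is no longer a farm-haystack extra, it must be installed directly; the guard above follows the usual optional-import pattern, sketched here in general form (the helper name and message are illustrative):

    # Sketch: import an optional dependency lazily and fail with an actionable message.
    def require_beir():
        try:
            import beir
        except ModuleNotFoundError as e:
            raise RuntimeError("beir is not installed. Please run `pip install beir`") from e
        return beir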
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Dict, Any

from haystack.preview import component, Document, default_to_dict, default_from_dict
- from haystack.preview.embedding_backends.sentence_transformers_backend import (
+ from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
_SentenceTransformersEmbeddingBackendFactory,
)

@@ -29,11 +29,13 @@ def __init__(
"""
Create a SentenceTransformersDocumentEmbedder component.
- :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``.
- :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
+ :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
+ such as ``'sentence-transformers/all-mpnet-base-v2'``.
+ :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
+ Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
- If this parameter is set to `True`, then the token generated when running
- `transformers-cli login` (stored in ~/.huggingface) will be used.
+ If this parameter is set to `True`, then the token generated when running
+ `transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each Document text before embedding.
:param suffix: A string to add to the end of each Document text before embedding.
:param batch_size: Number of strings to encode at once.
@@ -95,7 +97,7 @@ def run(self, documents: List[Document]):
Embed a list of Documents.
The embedding of each Document is stored in the `embedding` field of the Document.
"""
- if not isinstance(documents, list) or not isinstance(documents[0], Document):
+ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
"SentenceTransformersDocumentEmbedder expects a list of Documents as input."
"In case you want to embed a list of strings, please use the SentenceTransformersTextEmbedder."
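The reworked check also fixes a latent bug: the old guard indexed `documents[0]` and would raise an IndexError on an empty list, while `documents and not isinstance(documents[0], Document)` short-circuits to False for `[]`. A small sketch of the guard in isolation (the helper name is illustrative):

    from haystack.preview import Document

    def has_wrong_input_type(documents) -> bool:
        # `or` binds looser than `and`, so this reads:
        # not-a-list, OR (non-empty AND first element is not a Document)
        return not isinstance(documents, list) or bool(documents) and not isinstance(documents[0], Document)

    assert has_wrong_input_type("text")        # not a list
    assert has_wrong_input_type(["text"])      # list of strings, not Documents
    assert not has_wrong_input_type([])        # empty list passes and embeds to []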
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Dict, Any

from haystack.preview import component, default_to_dict, default_from_dict
- from haystack.preview.embedding_backends.sentence_transformers_backend import (
+ from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
_SentenceTransformersEmbeddingBackendFactory,
)

@@ -26,11 +26,13 @@ def __init__(
"""
Create a SentenceTransformersTextEmbedder component.
- :param model_name_or_path: Local path or name of the model in Hugging Face's model hub, such as ``'sentence-transformers/all-mpnet-base-v2'``.
- :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
+ :param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
+ such as ``'sentence-transformers/all-mpnet-base-v2'``.
+ :param device: Device (like 'cuda' / 'cpu') that should be used for computation.
+ Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
- If this parameter is set to `True`, then the token generated when running
- `transformers-cli login` (stored in ~/.huggingface) will be used.
+ If this parameter is set to `True`, then the token generated when running
+ `transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
:param batch_size: Number of strings to encode at once.
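The `prefix` and `suffix` parameters documented above matter for instruction-style embedding models. A hypothetical usage sketch (the model name follows the E5 convention, which expects a "query: " prefix; the import path and the `warm_up()` call are assumptions based on the preview layout, not documented defaults):

    from haystack.preview.components.embedders.sentence_transformers_text_embedder import (
        SentenceTransformersTextEmbedder,
    )

    embedder = SentenceTransformersTextEmbedder(
        model_name_or_path="intfloat/e5-base-v2",
        prefix="query: ",
    )
    embedder.warm_up()  # assumed to load the sentence-transformers backend
    embedding = embedder.run(text="What is the capital of France?")["embedding"]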
24 changes: 19 additions & 5 deletions haystack/preview/components/rankers/similarity.py
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
- from typing import List, Union, Dict, Any
+ from typing import List, Union, Dict, Any, Optional

from haystack.preview import ComponentError, Document, component, default_from_dict, default_to_dict
from haystack.preview.lazy_imports import LazyImport
@@ -34,17 +34,24 @@ class SimilarityRanker:
"""

def __init__(
- self, model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2", device: str = "cpu"
+ self,
+ model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2",
+ top_k: int = 10,
+ device: str = "cpu",
):
"""
Creates an instance of SimilarityRanker.
:param model_name_or_path: Path to a pre-trained sentence-transformers model.
+ :param top_k: The maximum number of documents to return per query.
:param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
"""
torch_and_transformers_import.check()

self.model_name_or_path = model_name_or_path
+ if top_k <= 0:
+ raise ValueError(f"top_k must be > 0, but got {top_k}")
+ self.top_k = top_k
self.device = device
self.model = None
self.tokenizer = None
@@ -63,7 +70,7 @@ def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
- return default_to_dict(self, device=self.device, model_name_or_path=self.model_name_or_path)
+ return default_to_dict(self, top_k=self.top_k, device=self.device, model_name_or_path=self.model_name_or_path)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":
@@ -73,17 +80,24 @@ def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
- def run(self, query: str, documents: List[Document]):
+ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
"""
Returns a list of documents ranked by their similarity to the given query
:param query: Query string.
:param documents: List of Documents.
+ :param top_k: The maximum number of documents to return.
:return: List of Documents sorted by (desc.) similarity with the query.
"""
if not documents:
return {"documents": []}

+ if top_k is None:
+ top_k = self.top_k

+ elif top_k <= 0:
+ raise ValueError(f"top_k must be > 0, but got {top_k}")

# If a model path is provided but the model isn't loaded
if self.model_name_or_path and not self.model:
raise ComponentError(
@@ -105,4 +119,4 @@ def run(self, query: str, documents: List[Document]):
i = sorted_index_tensor.item()
documents[i].score = similarity_scores[i].item()
ranked_docs.append(documents[i])
return {"documents": ranked_docs}
return {"documents": ranked_docs[:top_k]}
7 changes: 2 additions & 5 deletions pyproject.toml
@@ -152,9 +152,6 @@ docstores-gpu = [
audio = [
"openai-whisper"
]
- beir = [
- "beir; platform_system != 'Windows'",
- ]
aws = [
"boto3",
# Constrain botocore to avoid taking too much time to resolve the dependency tree.
@@ -246,11 +243,11 @@ formatting = [
]

all = [
"farm-haystack[inference,docstores,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,preview]",
"farm-haystack[inference,docstores,crawler,preprocessing,file-conversion,pdf,ocr,metrics,aws,preview]",
]
all-gpu = [
# beir is incompatible with faiss-gpu: https://github.com/beir-cellar/beir/issues/71
"farm-haystack[inference,docstores-gpu,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx-gpu,metrics,aws,preview]",
"farm-haystack[inference,docstores-gpu,crawler,preprocessing,file-conversion,pdf,ocr,metrics,aws,preview]",
]

[project.scripts]
3 changes: 3 additions & 0 deletions releasenotes/notes/review-all-extras-42d5a3a3d61f5393.yaml
@@ -0,0 +1,3 @@
+ upgrade:
+ - |
+ Removes the `audio`, `ray`, `onnx` and `beir` extras from the extra group `all`.
16 changes: 7 additions & 9 deletions test/pipelines/test_ray.py
@@ -1,17 +1,15 @@
from pathlib import Path

import pytest
- import ray

from haystack.pipelines import RayPipeline


- @pytest.fixture(autouse=True)
- def shutdown_ray():
- yield
+ @pytest.fixture()
+ def ray():
try:
import ray

+ yield ray

ray.serve.shutdown()
ray.shutdown()
except:
@@ -21,7 +19,7 @@ def shutdown_ray():
@pytest.mark.integration
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("serve_detached", [True, False])
- def test_load_pipeline(document_store_with_docs, serve_detached, samples_path):
+ def test_load_pipeline(ray, document_store_with_docs, serve_detached, samples_path):
pipeline = RayPipeline.load_from_yaml(
samples_path / "pipeline" / "ray.simple.haystack-pipeline.yml",
pipeline_name="ray_query_pipeline",
@@ -43,7 +41,7 @@ def test_load_pipeline(document_store_with_docs, serve_detached, samples_path):

@pytest.mark.integration
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
- def test_load_advanced_pipeline(document_store_with_docs, samples_path):
+ def test_load_advanced_pipeline(ray, document_store_with_docs, samples_path):
pipeline = RayPipeline.load_from_yaml(
samples_path / "pipeline" / "ray.advanced.haystack-pipeline.yml",
pipeline_name="ray_query_pipeline",
@@ -71,7 +69,7 @@ def test_load_advanced_pipeline(document_store_with_docs, samples_path):
@pytest.mark.asyncio
@pytest.mark.integration
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
- async def test_load_advanced_pipeline_async(document_store_with_docs, samples_path):
+ async def test_load_advanced_pipeline_async(ray, document_store_with_docs, samples_path):
pipeline = RayPipeline.load_from_yaml(
samples_path / "pipeline" / "ray.advanced.haystack-pipeline.yml",
pipeline_name="ray_query_pipeline",
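The rewritten fixture imports `ray` inside its body, so the module can be collected when ray is not installed. A compact variant of the same idea using pytest's built-in helper (a sketch, not the committed code):

    import pytest

    @pytest.fixture()
    def ray():
        # importorskip skips every test requesting this fixture when ray is
        # absent, rather than silencing failures in a bare except.
        ray = pytest.importorskip("ray")
        yield ray
        # ray.serve is importable here because the Ray pipeline code pulls it
        # in during the test; shut both layers down to isolate tests.
        ray.serve.shutdown()
        ray.shutdown()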
@@ -1,12 +1,12 @@
from unittest.mock import patch
import pytest
- from haystack.preview.embedding_backends.sentence_transformers_backend import (
+ from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
_SentenceTransformersEmbeddingBackendFactory,
)


@pytest.mark.unit
@patch("haystack.preview.embedding_backends.sentence_transformers_backend.SentenceTransformer")
@patch("haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer")
def test_factory_behavior(mock_sentence_transformer):
embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="my_model", device="cpu"
@@ -21,7 +21,7 @@ def test_factory_behavior(mock_sentence_transformer):


@pytest.mark.unit
@patch("haystack.preview.embedding_backends.sentence_transformers_backend.SentenceTransformer")
@patch("haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer")
def test_model_initialization(mock_sentence_transformer):
_SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path="model", device="cpu", use_auth_token="my_token"
@@ -32,7 +32,7 @@ def test_model_initialization(mock_sentence_transformer):


@pytest.mark.unit
@patch("haystack.preview.embedding_backends.sentence_transformers_backend.SentenceTransformer")
@patch("haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer")
def test_embedding_function_with_kwargs(mock_sentence_transformer):
embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(model_name_or_path="model")

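These patch targets track the module move: `unittest.mock.patch` must point at the location where a name is looked up, not where it is defined. A minimal sketch against the new backend path (the assertion is kept deliberately loose):

    from unittest.mock import patch

    from haystack.preview.components.embedders.backends.sentence_transformers_backend import (
        _SentenceTransformersEmbeddingBackendFactory,
    )

    # Patching sentence_transformers.SentenceTransformer directly would not
    # work: the backend module already holds its own reference to the class.
    with patch(
        "haystack.preview.components.embedders.backends.sentence_transformers_backend.SentenceTransformer"
    ) as mock_st:
        _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
            model_name_or_path="my_model", device="cpu"
        )
        mock_st.assert_called_once()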