Commit

Merge branch 'main' into current_date_template
medsriha authored Sep 6, 2024
2 parents 3e58425 + e31b3ed commit a304aa5
Showing 14 changed files with 230 additions and 5 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -61,6 +61,15 @@ Some examples of what you can do with Haystack:
>
> Are you looking for a managed solution that benefits from Haystack? [deepset Cloud](https://www.deepset.ai/deepset-cloud?utm_campaign=developer-relations&utm_source=haystack&utm_medium=readme) is our fully managed, end-to-end platform to integrate LLMs with your data, which uses Haystack for the LLM pipelines architecture.
## 🔜 Visual Pipeline Editor

Use **deepset Studio** to visually create and export your Haystack pipeline architecture as a YAML or as Python code. Learn more about it in [our announcement post](https://haystack.deepset.ai/blog/announcing-studio).

![studio](https://github.com/user-attachments/assets/e4f09746-20b5-433e-8261-eca224ac23b3)


👉 [Join the waitlist](https://landing.deepset.ai/deepset-studio-waitlist?utm_campaign=2408%20-%20Campaign%20-%20Studio%20Launch&utm_source=github&utm_medium=referral)!

## Telemetry

Haystack collects **anonymous** usage statistics of pipeline components. We receive an event every time these components are initialized. This way, we know which components are most relevant to our community.
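
If you prefer not to send these events, you can opt out before importing Haystack. A minimal sketch, assuming the documented `HAYSTACK_TELEMETRY_ENABLED` environment variable:

```python
# A minimal sketch: opt out of anonymous telemetry for this process.
# HAYSTACK_TELEMETRY_ENABLED is assumed to be the documented opt-out variable.
import os

os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"

import haystack  # noqa: E402  # telemetry events are no longer sent
```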
3 changes: 2 additions & 1 deletion docs/pydoc/config/converters_api.yml
@@ -13,7 +13,8 @@ loaders:
"txt",
"output_adapter",
"openapi_functions",
"docx"
"docx",
"csv"
]
ignore_when_discovered: ["__init__"]
processors:
2 changes: 2 additions & 0 deletions haystack/components/converters/__init__.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.csv import CSVToDocument
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
@@ -27,4 +28,5 @@
"DOCXToDocument",
"DOCXMetadata",
"PPTXToDocument",
"CSVToDocument",
]
93 changes: 93 additions & 0 deletions haystack/components/converters/csv.py
@@ -0,0 +1,93 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import io
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream

logger = logging.getLogger(__name__)


@component
class CSVToDocument:
    """
    Converts CSV files to Documents.

    By default, it uses UTF-8 encoding when converting files, but
    you can also set a custom encoding.
    It can attach metadata to the resulting documents.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.csv import CSVToDocument

    converter = CSVToDocument()
    results = converter.run(sources=["sample.csv"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'col1,col2\nrow1,row1\nrow2,row2\n'
    ```
    """

    def __init__(self, encoding: str = "utf-8"):
        """
        Creates a CSVToDocument component.

        :param encoding:
            The encoding of the CSV files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        """
        self.encoding = encoding

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts CSV files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                encoding = bytestream.meta.get("encoding", self.encoding)
                data = io.BytesIO(bytestream.data).getvalue().decode(encoding=encoding)
            except Exception as e:
                logger.warning(
                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **metadata}
            document = Document(content=data, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
2 changes: 2 additions & 0 deletions haystack/components/embedders/sentence_transformers_document_embedder.py
@@ -187,6 +187,8 @@ def warm_up(self):
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
2 changes: 2 additions & 0 deletions haystack/components/embedders/sentence_transformers_text_embedder.py
@@ -173,6 +173,8 @@ def warm_up(self):
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(embedding=List[float])
    def run(self, text: str):
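
The same two-line guard is added to both embedders. A minimal sketch of the resulting behavior (the model name is only an example; running this downloads the model): a `model_max_length` passed via `tokenizer_kwargs` now also caps the underlying SentenceTransformer's `max_seq_length` after `warm_up()`:

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",  # example model
    tokenizer_kwargs={"model_max_length": 256},
)
embedder.warm_up()  # loads the model, then applies the new max_seq_length override
print(embedder.embedding_backend.model.max_seq_length)  # 256
```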
4 changes: 4 additions & 0 deletions releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml
@@ -0,0 +1,4 @@
---
features:
- |
Add a CSV to Document converter component. It loads each file as a bytes object and adds the decoded string as a new Document, which can be processed further by the DocumentSplitter.
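
A sketch of the flow this note describes, pairing the new converter with the existing DocumentSplitter (the component names and the `sample.csv` path are placeholders):

```python
from haystack import Pipeline
from haystack.components.converters.csv import CSVToDocument
from haystack.components.preprocessors import DocumentSplitter

pipeline = Pipeline()
pipeline.add_component("converter", CSVToDocument())
pipeline.add_component("splitter", DocumentSplitter(split_by="word", split_length=50))
pipeline.connect("converter.documents", "splitter.documents")

# "sample.csv" is a placeholder path.
result = pipeline.run({"converter": {"sources": ["sample.csv"]}})
print(result["splitter"]["documents"])
```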
4 changes: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
---
features:
- |
Updates SentenceTransformersDocumentEmbedder and SentenceTransformersTextEmbedder so that a model_max_length passed through tokenizer_kwargs also updates the max_seq_length of the underlying SentenceTransformer model.
86 changes: 86 additions & 0 deletions test/components/converters/test_csv_to_document.py
@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import logging

import pytest

from haystack.components.converters.csv import CSVToDocument
from haystack.dataclasses import ByteStream


@pytest.fixture
def csv_converter():
    return CSVToDocument()


class TestCSVToDocument:
    def test_init(self, csv_converter):
        assert isinstance(csv_converter, CSVToDocument)

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"
        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
        converter = CSVToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
        assert isinstance(docs[0].content, str)
        assert docs[0].meta == bytestream.meta
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = [
            test_files_path / "csv" / "sample_2.csv",
            "non_existing_file.csv",
            test_files_path / "csv" / "sample_3.csv",
        ]
        converter = CSVToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
            assert "non_existing_file.csv" in caplog.text
        docs = output["documents"]
        assert len(docs) == 2
        assert docs[0].meta["file_path"] == str(paths[0])

    def test_encoding_override(self, test_files_path, caplog):
        """
        Test if the encoding metadata field is used properly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"

        converter = CSVToDocument(encoding="utf-16-le")
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=[bytestream])
        assert "codec can't decode" in caplog.text

        converter = CSVToDocument(encoding="utf-8")
        output = converter.run(sources=[bytestream])
        assert "Name,Age\r\n" in output["documents"][0].content

    def test_run_with_meta(self):
        bytestream = ByteStream(
            data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",
            meta={"name": "test_name", "language": "en"},
        )
        converter = CSVToDocument()
        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # Check that the metadata from the ByteStream is merged with the `meta` parameter.
        assert document.meta == {"name": "test_name", "language": "it"}
8 changes: 6 additions & 2 deletions test/components/embedders/test_sentence_transformers_document_embedder.py
@@ -226,18 +226,22 @@ def test_from_dict_none_device(self):
    )
    def test_warmup(self, mocked_factory):
        embedder = SentenceTransformersDocumentEmbedder(
            model="model",
            token=None,
            device=ComponentDevice.from_str("cpu"),
            tokenizer_kwargs={"model_max_length": 512},
        )
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        assert embedder.embedding_backend.model.max_seq_length == 512
        mocked_factory.get_embedding_backend.assert_called_once_with(
            model="model",
            device="cpu",
            auth_token=None,
            trust_remote_code=False,
            truncate_dim=None,
            model_kwargs=None,
            tokenizer_kwargs={"model_max_length": 512},
        )

    @patch(
8 changes: 6 additions & 2 deletions test/components/embedders/test_sentence_transformers_text_embedder.py
@@ -201,17 +201,23 @@ def test_from_dict_none_device(self):
        "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory"
    )
    def test_warmup(self, mocked_factory):
        embedder = SentenceTransformersTextEmbedder(
            model="model",
            token=None,
            device=ComponentDevice.from_str("cpu"),
            tokenizer_kwargs={"model_max_length": 512},
        )
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        assert embedder.embedding_backend.model.max_seq_length == 512
        mocked_factory.get_embedding_backend.assert_called_once_with(
            model="model",
            device="cpu",
            auth_token=None,
            trust_remote_code=False,
            truncate_dim=None,
            model_kwargs=None,
            tokenizer_kwargs={"model_max_length": 512},
        )

    @patch(
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_1.csv
@@ -0,0 +1,4 @@
Name,Age
John Doe,27
Jane Smith,37
Mike Johnson,47
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_2.csv
@@ -0,0 +1,4 @@
Name,City
John Doe,New York
Jane Smith,Los Angeles
Mike Johnson,Chicago
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_3.csv
@@ -0,0 +1,4 @@
Name,Email
John Doe,[email protected]
Jane Smith,[email protected]
Mike Johnson,[email protected]
