Commit

Merge branch 'main' into current_date_template
medsriha authored Sep 6, 2024
2 parents 3e58425 + e31b3ed commit a304aa5
Showing 14 changed files with 230 additions and 5 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -61,6 +61,15 @@ Some examples of what you can do with Haystack:
>
> Are you looking for a managed solution that benefits from Haystack? [deepset Cloud](https://www.deepset.ai/deepset-cloud?utm_campaign=developer-relations&utm_source=haystack&utm_medium=readme) is our fully managed, end-to-end platform to integrate LLMs with your data, which uses Haystack for the LLM pipelines architecture.
## 🔜 Visual Pipeline Editor

Use **deepset Studio** to visually create and export your Haystack pipeline architecture as a YAML or as Python code. Learn more about it in [our announcement post](https://haystack.deepset.ai/blog/announcing-studio).

![studio](https://github.com/user-attachments/assets/e4f09746-20b5-433e-8261-eca224ac23b3)


👉 [Join the waitlist](https://landing.deepset.ai/deepset-studio-waitlist?utm_campaign=2408%20-%20Campaign%20-%20Studio%20Launch&utm_source=github&utm_medium=referral)!

## Telemetry

Haystack collects **anonymous** usage statistics of pipeline components. We receive an event every time these components are initialized. This way, we know which components are most relevant to our community.
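
If you prefer not to send these events, you can opt out before importing Haystack. A minimal sketch, assuming the documented `HAYSTACK_TELEMETRY_ENABLED` environment variable:

```python
# A minimal sketch: opt out of anonymous telemetry for this process.
# HAYSTACK_TELEMETRY_ENABLED is assumed to be the documented opt-out variable.
import os

os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"

import haystack  # noqa: E402  # telemetry events are no longer sent
```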
3 changes: 2 additions & 1 deletion docs/pydoc/config/converters_api.yml
@@ -13,7 +13,8 @@ loaders:
"txt",
"output_adapter",
"openapi_functions",
"docx"
"docx",
"csv"
]
ignore_when_discovered: ["__init__"]
processors:
2 changes: 2 additions & 0 deletions haystack/components/converters/__init__.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.csv import CSVToDocument
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
@@ -27,4 +28,5 @@
"DOCXToDocument",
"DOCXMetadata",
"PPTXToDocument",
"CSVToDocument",
]
93 changes: 93 additions & 0 deletions haystack/components/converters/csv.py
@@ -0,0 +1,93 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import io
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream

logger = logging.getLogger(__name__)


@component
class CSVToDocument:
    """
    Converts CSV files to Documents.

    By default, it uses UTF-8 encoding when converting files, but
    you can also set a custom encoding.
    It can attach metadata to the resulting documents.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.csv import CSVToDocument

    converter = CSVToDocument()
    results = converter.run(sources=["sample.csv"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'col1,col2\nrow1,row1\nrow2,row2\n'
    ```
    """

    def __init__(self, encoding: str = "utf-8"):
        """
        Creates a CSVToDocument component.

        :param encoding:
            The encoding of the CSV files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        """
        self.encoding = encoding

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts CSV files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                encoding = bytestream.meta.get("encoding", self.encoding)
                data = io.BytesIO(bytestream.data).getvalue().decode(encoding=encoding)
            except Exception as e:
                logger.warning(
                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **metadata}
            document = Document(content=data, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}
2 changes: 2 additions & 0 deletions haystack/components/embedders/sentence_transformers_document_embedder.py
@@ -187,6 +187,8 @@ def warm_up(self):
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
2 changes: 2 additions & 0 deletions haystack/components/embedders/sentence_transformers_text_embedder.py
@@ -173,6 +173,8 @@ def warm_up(self):
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(embedding=List[float])
    def run(self, text: str):
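
The same two-line guard is added to both embedders. A minimal sketch of the resulting behavior (the model name is only an example; running this downloads the model): a `model_max_length` passed via `tokenizer_kwargs` now also caps the underlying SentenceTransformer's `max_seq_length` after `warm_up()`:

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",  # example model
    tokenizer_kwargs={"model_max_length": 256},
)
embedder.warm_up()  # loads the model, then applies the new max_seq_length override
print(embedder.embedding_backend.model.max_seq_length)  # 256
```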
4 changes: 4 additions & 0 deletions releasenotes/notes/add-csv-converter-5c0d52f180d498f5.yaml
@@ -0,0 +1,4 @@
---
features:
- |
Add a CSV to Document converter component. It loads each file as a bytes object and adds the decoded string as a new Document, which can be processed further by the DocumentSplitter.
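
A sketch of the flow this note describes, pairing the new converter with the existing DocumentSplitter (the component names and the `sample.csv` path are placeholders):

```python
from haystack import Pipeline
from haystack.components.converters.csv import CSVToDocument
from haystack.components.preprocessors import DocumentSplitter

pipeline = Pipeline()
pipeline.add_component("converter", CSVToDocument())
pipeline.add_component("splitter", DocumentSplitter(split_by="word", split_length=50))
pipeline.connect("converter.documents", "splitter.documents")

# "sample.csv" is a placeholder path.
result = pipeline.run({"converter": {"sources": ["sample.csv"]}})
print(result["splitter"]["documents"])
```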
4 changes: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
---
features:
- |
Updates SentenceTransformersDocumentEmbedder and SentenceTransformersTextEmbedder so that a model_max_length passed through tokenizer_kwargs also updates the max_seq_length of the underlying SentenceTransformer model.
86 changes: 86 additions & 0 deletions test/components/converters/test_csv_to_document.py
@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import logging

import pytest

from haystack.components.converters.csv import CSVToDocument
from haystack.dataclasses import ByteStream


@pytest.fixture
def csv_converter():
    return CSVToDocument()


class TestCSVToDocument:
    def test_init(self, csv_converter):
        assert isinstance(csv_converter, CSVToDocument)

    def test_run(self, test_files_path):
        """
        Test if the component runs correctly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"
        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
        converter = CSVToDocument()
        output = converter.run(sources=files)
        docs = output["documents"]
        assert len(docs) == 3
        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
        assert isinstance(docs[0].content, str)
        assert docs[0].meta == bytestream.meta
        assert docs[1].meta["file_path"] == str(files[1])
        assert docs[2].meta["file_path"] == str(files[2])

    def test_run_error_handling(self, test_files_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = [
            test_files_path / "csv" / "sample_2.csv",
            "non_existing_file.csv",
            test_files_path / "csv" / "sample_3.csv",
        ]
        converter = CSVToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=paths)
            assert "non_existing_file.csv" in caplog.text
        docs = output["documents"]
        assert len(docs) == 2
        assert docs[0].meta["file_path"] == str(paths[0])

    def test_encoding_override(self, test_files_path, caplog):
        """
        Test if the encoding metadata field is used properly.
        """
        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
        bytestream.meta["key"] = "value"

        converter = CSVToDocument(encoding="utf-16-le")
        with caplog.at_level(logging.WARNING):
            output = converter.run(sources=[bytestream])
        assert "codec can't decode" in caplog.text

        converter = CSVToDocument(encoding="utf-8")
        output = converter.run(sources=[bytestream])
        assert "Name,Age\r\n" in output["documents"][0].content

    def test_run_with_meta(self):
        bytestream = ByteStream(
            data=b"Name,Age,City\r\nAlice,30,New York\r\nBob,25,Los Angeles\r\nCharlie,35,Chicago\r\n",
            meta={"name": "test_name", "language": "en"},
        )
        converter = CSVToDocument()
        output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
        document = output["documents"][0]

        # Check that the metadata from the ByteStream is merged with the `meta` parameter.
        assert document.meta == {"name": "test_name", "language": "it"}
8 changes: 6 additions & 2 deletions test/components/embedders/test_sentence_transformers_document_embedder.py
@@ -226,18 +226,22 @@ def test_from_dict_none_device(self):
    )
    def test_warmup(self, mocked_factory):
        embedder = SentenceTransformersDocumentEmbedder(
            model="model",
            token=None,
            device=ComponentDevice.from_str("cpu"),
            tokenizer_kwargs={"model_max_length": 512},
        )
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        assert embedder.embedding_backend.model.max_seq_length == 512
        mocked_factory.get_embedding_backend.assert_called_once_with(
            model="model",
            device="cpu",
            auth_token=None,
            trust_remote_code=False,
            truncate_dim=None,
            model_kwargs=None,
            tokenizer_kwargs={"model_max_length": 512},
        )

    @patch(
8 changes: 6 additions & 2 deletions test/components/embedders/test_sentence_transformers_text_embedder.py
@@ -201,17 +201,23 @@ def test_from_dict_none_device(self):
        "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory"
    )
    def test_warmup(self, mocked_factory):
        embedder = SentenceTransformersTextEmbedder(
            model="model",
            token=None,
            device=ComponentDevice.from_str("cpu"),
            tokenizer_kwargs={"model_max_length": 512},
        )
        mocked_factory.get_embedding_backend.assert_not_called()
        embedder.warm_up()
        assert embedder.embedding_backend.model.max_seq_length == 512
        mocked_factory.get_embedding_backend.assert_called_once_with(
            model="model",
            device="cpu",
            auth_token=None,
            trust_remote_code=False,
            truncate_dim=None,
            model_kwargs=None,
            tokenizer_kwargs={"model_max_length": 512},
        )

    @patch(
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_1.csv
@@ -0,0 +1,4 @@
Name,Age
John Doe,27
Jane Smith,37
Mike Johnson,47
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_2.csv
@@ -0,0 +1,4 @@
Name,City
John Doe,New York
Jane Smith,Los Angeles
Mike Johnson,Chicago
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_3.csv
@@ -0,0 +1,4 @@
Name,Email
John Doe,[email protected]
Jane Smith,[email protected]
Mike Johnson,[email protected]
