Skip to content

Commit

Permalink
test: MultiFileConverter
Browse files Browse the repository at this point in the history
  • Loading branch information
mathislucka committed Feb 6, 2025
1 parent 3f33ddd commit eac0142
Show file tree
Hide file tree
Showing 15 changed files with 1,853 additions and 74 deletions.
Empty file.
4 changes: 2 additions & 2 deletions haystack_experimental/super_components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from haystack_experimental.super_components.converters.file_converter import AutoFileConverter
from haystack_experimental.super_components.converters.multi_file_converter import MultiFileConverter

_all_ = ["AutoFileConverter"]
__all__ = ["MultiFileConverter"]
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional
from typing import Any, Dict

from haystack import Pipeline, component, default_from_dict, default_to_dict
from haystack.components.converters import (
Expand All @@ -18,11 +18,9 @@
XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.components.routers import FileTypeRouter
from haystack.utils import deserialize_callable, serialize_callable

from haystack_experimental.core.super_component import SuperComponentBase
from haystack_experimental.core.super_component import SuperComponent


class ConverterMimeType(str, Enum):
Expand All @@ -38,11 +36,11 @@ class ConverterMimeType(str, Enum):


@component
class AutoFileConverter(SuperComponentBase):
class MultiFileConverter(SuperComponent):
"""
A file converter that handles multiple file types and their pre-processing.
A file converter that handles conversion of multiple file types.
The AutoFileConverter handles the following file types:
The MultiFileConverter handles the following file types:
- CSV
- DOCX
- HTML
Expand All @@ -53,39 +51,18 @@ class AutoFileConverter(SuperComponentBase):
- PPTX
- XLSX
It splits all non-tabular data into Documents as specified by the splitting parameters.
Tabular data (CSV & XLSX) is returned without splitting.
Usage:
```
converter = AutoFileConverter()
converter = MultiFileConverter()
converter.run(sources=["test.txt", "test.pdf"], meta={})
```
"""

def __init__( # noqa: PLR0915
self,
split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
split_length: int = 250,
split_overlap: int = 30,
split_threshold: int = 0,
splitting_function: Optional[Callable[[str], List[str]]] = None,
respect_sentence_boundary: bool = True,
language: Language = "en",
use_split_rules: bool = True,
extend_abbreviations: bool = True,
encoding: str = "utf-8",
json_content_key: str = "content",
) -> None:
self.split_by = split_by
self.split_length = split_length
self.split_overlap = split_overlap
self.split_threshold = split_threshold
self.splitting_function = splitting_function
self.respect_sentence_boundary = respect_sentence_boundary
self.language = language
self.use_split_rules = use_split_rules
self.extend_abbreviations = extend_abbreviations
self.encoding = encoding
self.json_content_key = json_content_key

Expand Down Expand Up @@ -115,19 +92,8 @@ def __init__( # noqa: PLR0915
xlsx = XLSXToDocument()

joiner = DocumentJoiner()
tabular_joiner = DocumentJoiner()

splitter = DocumentSplitter(
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
split_threshold=self.split_threshold,
splitting_function=self.splitting_function,
respect_sentence_boundary=self.respect_sentence_boundary,
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
)



# Create pipeline and add components
pp = Pipeline()
Expand All @@ -143,11 +109,9 @@ def __init__( # noqa: PLR0915
pp.add_component("pptx", pptx)
pp.add_component("xlsx", xlsx)
pp.add_component("joiner", joiner)
pp.add_component("splitter", splitter)
pp.add_component("tabular_joiner", tabular_joiner)
pp.add_component("csv", csv)


pp.connect(f"router.{ConverterMimeType.CSV.value}", "csv")
pp.connect(f"router.{ConverterMimeType.DOCX.value}", "docx")
pp.connect(f"router.{ConverterMimeType.HTML.value}", "html")
pp.connect(f"router.{ConverterMimeType.JSON.value}", "json")
Expand All @@ -157,8 +121,6 @@ def __init__( # noqa: PLR0915
pp.connect(f"router.{ConverterMimeType.PPTX.value}", "pptx")
pp.connect(f"router.{ConverterMimeType.XLSX.value}", "xlsx")

pp.connect("joiner.documents", "splitter.documents")
pp.connect("splitter.documents", "tabular_joiner.documents")
pp.connect("docx.documents", "joiner.documents")
pp.connect("html.documents", "joiner.documents")
pp.connect("json.documents", "joiner.documents")
Expand All @@ -167,18 +129,17 @@ def __init__( # noqa: PLR0915
pp.connect("pdf.documents", "joiner.documents")
pp.connect("pptx.documents", "joiner.documents")

pp.connect("csv.documents", "tabular_joiner.documents")
pp.connect("xlsx.documents", "tabular_joiner.documents")
pp.connect(f"router.{ConverterMimeType.CSV.value}", "csv")
pp.connect("csv.documents", "joiner.documents")
pp.connect("xlsx.documents", "joiner.documents")


output_mapping = {"tabular_joiner.documents": "documents"}
output_mapping = {"joiner.documents": "documents"}
input_mapping = {
"sources": ["router.sources"],
"meta": ["router.meta"]
}

super(AutoFileConverter, self).__init__(
super(MultiFileConverter, self).__init__(
pipeline=pp,
output_mapping=output_mapping,
input_mapping=input_mapping
Expand All @@ -188,32 +149,15 @@ def to_dict(self) -> Dict[str, Any]:
"""
Serialize this instance to a dictionary.
"""
if self.splitting_function is not None:
splitting_function = serialize_callable(self.splitting_function)
else:
splitting_function = self.splitting_function

return default_to_dict(
self,
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
split_threshold=self.split_threshold,
splitting_function=splitting_function,
respect_sentence_boundary=self.respect_sentence_boundary,
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
encoding=self.encoding,
json_content_key=self.json_content_key,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AutoFileConverter":
def from_dict(cls, data: Dict[str, Any]) -> "MultiFileConverter":
"""
Load this instance from a dictionary.
"""
if splitting_function := data["init_parameters"].get("splitting_function"):
data["init_parameters"]["splitting_function"] = deserialize_callable(splitting_function)

return default_from_dict(cls, data)
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

from haystack_experimental.core.super_component import SuperComponentBase
from haystack_experimental.core.super_component import SuperComponent

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"


@component
class DocumentIndexer(SuperComponentBase):
class DocumentIndexer(SuperComponent):
"""
A document indexer that takes a list of documents and indexes them using the specified model.
Expand Down
Empty file.
129 changes: 129 additions & 0 deletions test/super_components/converters/test_multi_file_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pytest
from pathlib import Path
from unittest.mock import Mock, patch

from haystack import Document
from haystack.dataclasses import ByteStream
from haystack_experimental.core.super_component import SuperComponent
from haystack_experimental.super_components.converters.multi_file_converter import MultiFileConverter

@pytest.fixture
def converter():
    """Provide a MultiFileConverter built with its default parameters."""
    instance = MultiFileConverter()
    return instance


class TestMultiFileConverter:
    """Unit and integration tests for MultiFileConverter.

    Unit tests cover construction and (de)serialization; tests marked
    ``integration`` exercise the full conversion pipeline against the
    fixture files under ``test_files_path``.
    """

    def test_init_default_params(self, converter):
        """Test initialization with default parameters"""
        assert converter.encoding == "utf-8"
        assert converter.json_content_key == "content"
        assert isinstance(converter, SuperComponent)

    def test_init_custom_params(self):
        """Test initialization with custom parameters"""
        # NOTE: the default `converter` fixture is intentionally not requested
        # here — this test builds its own instance with non-default arguments.
        converter = MultiFileConverter(
            encoding="latin-1",
            json_content_key="text"
        )
        assert converter.encoding == "latin-1"
        assert converter.json_content_key == "text"

    def test_to_dict(self, converter):
        """Test serialization to dictionary"""
        data = converter.to_dict()
        assert data == {
            "type": "haystack_experimental.super_components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {
                "encoding": "utf-8",
                "json_content_key": "content"
            }
        }

    def test_from_dict(self):
        """Test deserialization from dictionary"""
        data = {
            "type": "haystack_experimental.super_components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {
                "encoding": "latin-1",
                "json_content_key": "text"
            }
        }
        conv = MultiFileConverter.from_dict(data)
        assert conv.encoding == "latin-1"
        assert conv.json_content_key == "text"

    @pytest.mark.parametrize(
        "suffix,file_path",
        [
            ("csv", "csv/sample_1.csv"),
            ("docx", "docx/sample_docx.docx"),
            ("html", "html/what_is_haystack.html"),
            ("json", "json/json_conversion_testfile.json"),
            ("md", "markdown/sample.md"),
            ("pdf", "pdf/sample_pdf_1.pdf"),
            ("pptx", "pptx/sample_pptx.pptx"),
            ("txt", "txt/doc_1.txt"),
            ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
        ]
    )
    @pytest.mark.integration
    def test_run(self, test_files_path, converter, suffix, file_path):
        """Test that each supported file type converts to a single Document."""
        paths = [test_files_path / file_path]
        output = converter.run(sources=paths)
        docs = output["documents"]

        assert len(docs) == 1
        assert isinstance(docs[0], Document)
        assert docs[0].content is not None
        assert docs[0].meta["file_path"].endswith(suffix)

    def test_run_with_meta(self, test_files_path, converter):
        """Test conversion with metadata"""
        paths = [test_files_path / "txt" / "doc_1.txt"]
        meta = {"language": "en", "author": "test"}
        output = converter.run(sources=paths, meta=meta)
        docs = output["documents"]
        # User-supplied meta must be merged into each produced Document.
        assert docs[0].meta["language"] == "en"
        assert docs[0].meta["author"] == "test"

    def test_run_with_bytestream(self, converter):
        """Test converting ByteStream input"""
        # No file-system fixture needed: the source is an in-memory ByteStream.
        bytestream = ByteStream(
            data=b"test content",
            mime_type="text/plain",
            meta={"file_path": "test.txt"}
        )
        output = converter.run(sources=[bytestream])
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content == "test content"
        assert docs[0].meta["file_path"] == "test.txt"

    def test_run_error_handling(self, test_files_path, converter, caplog):
        """Test error handling for non-existent files"""
        paths = [test_files_path / "non_existent.txt"]
        # A missing file should be logged as a warning, not raise.
        with caplog.at_level("WARNING"):
            output = converter.run(sources=paths)
            assert "Could not read" in caplog.text
        assert len(output["documents"]) == 0

    @pytest.mark.integration
    def test_run_all_file_types(self, test_files_path, converter):
        """Test converting all supported file types in parallel"""
        paths = [
            test_files_path / "csv" / "sample_1.csv",
            test_files_path / "docx" / "sample_docx.docx",
            test_files_path / "html" / "what_is_haystack.html",
            test_files_path / "json" / "json_conversion_testfile.json",
            test_files_path / "markdown" / "sample.md",
            test_files_path / "txt" / "doc_1.txt",
            test_files_path / "pdf" / "sample_pdf_1.pdf",
            test_files_path / "pptx" / "sample_pptx.pptx",
            test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"
        ]
        output = converter.run(sources=paths)
        docs = output["documents"]

        # Verify we got a document for each file
        assert len(docs) == len(paths)
        assert all(isinstance(doc, Document) for doc in docs)
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Age
John Doe,27
Jane Smith,37
Mike Johnson,47
Binary file added test/test_files/docx/sample_docx.docx
Binary file not shown.
Loading

0 comments on commit eac0142

Please sign in to comment.