Skip to content

Commit

Permalink
test: MultiFileConverter
Browse files Browse the repository at this point in the history
  • Loading branch information
mathislucka committed Feb 6, 2025
1 parent 3f33ddd commit eac0142
Show file tree
Hide file tree
Showing 15 changed files with 1,853 additions and 74 deletions.
Empty file.
4 changes: 2 additions & 2 deletions haystack_experimental/super_components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from haystack_experimental.super_components.converters.file_converter import AutoFileConverter
from haystack_experimental.super_components.converters.multi_file_converter import MultiFileConverter

_all_ = ["AutoFileConverter"]
__all__ = ["MultiFileConverter"]
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional
from typing import Any, Dict

from haystack import Pipeline, component, default_from_dict, default_to_dict
from haystack.components.converters import (
Expand All @@ -18,11 +18,9 @@
XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.components.routers import FileTypeRouter
from haystack.utils import deserialize_callable, serialize_callable

from haystack_experimental.core.super_component import SuperComponentBase
from haystack_experimental.core.super_component import SuperComponent


class ConverterMimeType(str, Enum):
Expand All @@ -38,11 +36,11 @@ class ConverterMimeType(str, Enum):


@component
class AutoFileConverter(SuperComponentBase):
class MultiFileConverter(SuperComponent):
"""
A file converter that handles multiple file types and their pre-processing.
A file converter that handles conversion of multiple file types.
The AutoFileConverter handles the following file types:
The MultiFileConverter handles the following file types:
- CSV
- DOCX
- HTML
Expand All @@ -53,39 +51,18 @@ class AutoFileConverter(SuperComponentBase):
- PPTX
- XLSX
It splits all non-tabular data into Documents as specified by the splitting parameters.
Tabular data (CSV & XLSX) is returned without splitting.
Usage:
```
converter = AutoFileConverter()
converter = MultiFileConverter()
converter.run(sources=["test.txt", "test.pdf"], meta={})
```
"""

def __init__( # noqa: PLR0915
self,
split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
split_length: int = 250,
split_overlap: int = 30,
split_threshold: int = 0,
splitting_function: Optional[Callable[[str], List[str]]] = None,
respect_sentence_boundary: bool = True,
language: Language = "en",
use_split_rules: bool = True,
extend_abbreviations: bool = True,
encoding: str = "utf-8",
json_content_key: str = "content",
) -> None:
self.split_by = split_by
self.split_length = split_length
self.split_overlap = split_overlap
self.split_threshold = split_threshold
self.splitting_function = splitting_function
self.respect_sentence_boundary = respect_sentence_boundary
self.language = language
self.use_split_rules = use_split_rules
self.extend_abbreviations = extend_abbreviations
self.encoding = encoding
self.json_content_key = json_content_key

Expand Down Expand Up @@ -115,19 +92,8 @@ def __init__( # noqa: PLR0915
xlsx = XLSXToDocument()

joiner = DocumentJoiner()
tabular_joiner = DocumentJoiner()

splitter = DocumentSplitter(
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
split_threshold=self.split_threshold,
splitting_function=self.splitting_function,
respect_sentence_boundary=self.respect_sentence_boundary,
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
)



# Create pipeline and add components
pp = Pipeline()
Expand All @@ -143,11 +109,9 @@ def __init__( # noqa: PLR0915
pp.add_component("pptx", pptx)
pp.add_component("xlsx", xlsx)
pp.add_component("joiner", joiner)
pp.add_component("splitter", splitter)
pp.add_component("tabular_joiner", tabular_joiner)
pp.add_component("csv", csv)


pp.connect(f"router.{ConverterMimeType.CSV.value}", "csv")
pp.connect(f"router.{ConverterMimeType.DOCX.value}", "docx")
pp.connect(f"router.{ConverterMimeType.HTML.value}", "html")
pp.connect(f"router.{ConverterMimeType.JSON.value}", "json")
Expand All @@ -157,8 +121,6 @@ def __init__( # noqa: PLR0915
pp.connect(f"router.{ConverterMimeType.PPTX.value}", "pptx")
pp.connect(f"router.{ConverterMimeType.XLSX.value}", "xlsx")

pp.connect("joiner.documents", "splitter.documents")
pp.connect("splitter.documents", "tabular_joiner.documents")
pp.connect("docx.documents", "joiner.documents")
pp.connect("html.documents", "joiner.documents")
pp.connect("json.documents", "joiner.documents")
Expand All @@ -167,18 +129,17 @@ def __init__( # noqa: PLR0915
pp.connect("pdf.documents", "joiner.documents")
pp.connect("pptx.documents", "joiner.documents")

pp.connect("csv.documents", "tabular_joiner.documents")
pp.connect("xlsx.documents", "tabular_joiner.documents")
pp.connect(f"router.{ConverterMimeType.CSV.value}", "csv")
pp.connect("csv.documents", "joiner.documents")
pp.connect("xlsx.documents", "joiner.documents")


output_mapping = {"tabular_joiner.documents": "documents"}
output_mapping = {"joiner.documents": "documents"}
input_mapping = {
"sources": ["router.sources"],
"meta": ["router.meta"]
}

super(AutoFileConverter, self).__init__(
super(MultiFileConverter, self).__init__(
pipeline=pp,
output_mapping=output_mapping,
input_mapping=input_mapping
Expand All @@ -188,32 +149,15 @@ def to_dict(self) -> Dict[str, Any]:
"""
Serialize this instance to a dictionary.
"""
if self.splitting_function is not None:
splitting_function = serialize_callable(self.splitting_function)
else:
splitting_function = self.splitting_function

return default_to_dict(
self,
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
split_threshold=self.split_threshold,
splitting_function=splitting_function,
respect_sentence_boundary=self.respect_sentence_boundary,
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
encoding=self.encoding,
json_content_key=self.json_content_key,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AutoFileConverter":
def from_dict(cls, data: Dict[str, Any]) -> "MultiFileConverter":
"""
Load this instance from a dictionary.
"""
if splitting_function := data["init_parameters"].get("splitting_function"):
data["init_parameters"]["splitting_function"] = deserialize_callable(splitting_function)

return default_from_dict(cls, data)
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

from haystack_experimental.core.super_component import SuperComponentBase
from haystack_experimental.core.super_component import SuperComponent

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"


@component
class DocumentIndexer(SuperComponentBase):
class DocumentIndexer(SuperComponent):
"""
A document indexer that takes a list of documents and indexes them using the specified model.
Expand Down
Empty file.
129 changes: 129 additions & 0 deletions test/super_components/converters/test_multi_file_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pytest
from pathlib import Path
from unittest.mock import Mock, patch

from haystack import Document
from haystack.dataclasses import ByteStream
from haystack_experimental.core.super_component import SuperComponent
from haystack_experimental.super_components.converters.multi_file_converter import MultiFileConverter

@pytest.fixture
def converter():
    """Provide a MultiFileConverter built with its default parameters."""
    instance = MultiFileConverter()
    return instance


class TestMultiFileConverter:
    """Unit and integration tests for MultiFileConverter.

    Unit tests cover construction and (de)serialization; tests marked
    ``integration`` exercise the full conversion pipeline against the
    fixture files under ``test_files_path``.
    """

    def test_init_default_params(self, converter):
        """Test initialization with default parameters"""
        assert converter.encoding == "utf-8"
        assert converter.json_content_key == "content"
        assert isinstance(converter, SuperComponent)

    def test_init_custom_params(self):
        """Test initialization with custom parameters"""
        # NOTE: the default `converter` fixture is intentionally not requested
        # here — this test builds its own instance with non-default arguments.
        converter = MultiFileConverter(
            encoding="latin-1",
            json_content_key="text"
        )
        assert converter.encoding == "latin-1"
        assert converter.json_content_key == "text"

    def test_to_dict(self, converter):
        """Test serialization to dictionary"""
        data = converter.to_dict()
        assert data == {
            "type": "haystack_experimental.super_components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {
                "encoding": "utf-8",
                "json_content_key": "content"
            }
        }

    def test_from_dict(self):
        """Test deserialization from dictionary"""
        data = {
            "type": "haystack_experimental.super_components.converters.multi_file_converter.MultiFileConverter",
            "init_parameters": {
                "encoding": "latin-1",
                "json_content_key": "text"
            }
        }
        conv = MultiFileConverter.from_dict(data)
        assert conv.encoding == "latin-1"
        assert conv.json_content_key == "text"

    @pytest.mark.parametrize(
        "suffix,file_path",
        [
            ("csv", "csv/sample_1.csv"),
            ("docx", "docx/sample_docx.docx"),
            ("html", "html/what_is_haystack.html"),
            ("json", "json/json_conversion_testfile.json"),
            ("md", "markdown/sample.md"),
            ("pdf", "pdf/sample_pdf_1.pdf"),
            ("pptx", "pptx/sample_pptx.pptx"),
            ("txt", "txt/doc_1.txt"),
            ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"),
        ]
    )
    @pytest.mark.integration
    def test_run(self, test_files_path, converter, suffix, file_path):
        """Test that each supported file type converts to a single Document."""
        paths = [test_files_path / file_path]
        output = converter.run(sources=paths)
        docs = output["documents"]

        assert len(docs) == 1
        assert isinstance(docs[0], Document)
        assert docs[0].content is not None
        assert docs[0].meta["file_path"].endswith(suffix)

    def test_run_with_meta(self, test_files_path, converter):
        """Test conversion with metadata"""
        paths = [test_files_path / "txt" / "doc_1.txt"]
        meta = {"language": "en", "author": "test"}
        output = converter.run(sources=paths, meta=meta)
        docs = output["documents"]
        # User-supplied meta must be merged into each produced Document.
        assert docs[0].meta["language"] == "en"
        assert docs[0].meta["author"] == "test"

    def test_run_with_bytestream(self, converter):
        """Test converting ByteStream input"""
        # No file-system fixture needed: the source is an in-memory ByteStream.
        bytestream = ByteStream(
            data=b"test content",
            mime_type="text/plain",
            meta={"file_path": "test.txt"}
        )
        output = converter.run(sources=[bytestream])
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content == "test content"
        assert docs[0].meta["file_path"] == "test.txt"

    def test_run_error_handling(self, test_files_path, converter, caplog):
        """Test error handling for non-existent files"""
        paths = [test_files_path / "non_existent.txt"]
        # A missing file should be logged as a warning, not raise.
        with caplog.at_level("WARNING"):
            output = converter.run(sources=paths)
            assert "Could not read" in caplog.text
        assert len(output["documents"]) == 0

    @pytest.mark.integration
    def test_run_all_file_types(self, test_files_path, converter):
        """Test converting all supported file types in parallel"""
        paths = [
            test_files_path / "csv" / "sample_1.csv",
            test_files_path / "docx" / "sample_docx.docx",
            test_files_path / "html" / "what_is_haystack.html",
            test_files_path / "json" / "json_conversion_testfile.json",
            test_files_path / "markdown" / "sample.md",
            test_files_path / "txt" / "doc_1.txt",
            test_files_path / "pdf" / "sample_pdf_1.pdf",
            test_files_path / "pptx" / "sample_pptx.pptx",
            test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"
        ]
        output = converter.run(sources=paths)
        docs = output["documents"]

        # Verify we got a document for each file
        assert len(docs) == len(paths)
        assert all(isinstance(doc, Document) for doc in docs)
4 changes: 4 additions & 0 deletions test/test_files/csv/sample_1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Age
John Doe,27
Jane Smith,37
Mike Johnson,47
Binary file added test/test_files/docx/sample_docx.docx
Binary file not shown.
Loading

0 comments on commit eac0142

Please sign in to comment.