diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml index 6c89138c98..69a35d4e5b 100644 --- a/docs/pydoc/config/converters_api.yml +++ b/docs/pydoc/config/converters_api.yml @@ -16,6 +16,7 @@ loaders: "pypdf", "tika", "txt", + "xlsx", ] ignore_when_discovered: ["__init__"] processors: diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index 4561dd1e0a..2c7ed33505 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -15,6 +15,7 @@ from haystack.components.converters.pypdf import PyPDFToDocument from haystack.components.converters.tika import TikaDocumentConverter from haystack.components.converters.txt import TextFileToDocument +from haystack.components.converters.xlsx import XLSXToDocument __all__ = [ "TextFileToDocument", @@ -31,4 +32,5 @@ "PPTXToDocument", "CSVToDocument", "JSONConverter", + "XLSXToDocument", ] diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py new file mode 100644 index 0000000000..db7dca8fed --- /dev/null +++ b/haystack/components/converters/xlsx.py @@ -0,0 +1,180 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import io +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +import pandas as pd + +from haystack import Document, component, logging +from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata +from haystack.dataclasses import ByteStream +from haystack.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + +with LazyImport("Run 'pip install openpyxl'") as xlsx_import: + import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced + +with LazyImport("Run 'pip install tabulate'") as tabulate_import: + from tabulate import tabulate # pylint: disable=unused-import # the 
@component
class XLSXToDocument:
    """
    Converts XLSX (Excel) files into Documents.

    Supports reading data from specific sheets or all sheets in the Excel file. If all sheets are read, a Document is
    created for each sheet. The content of the Document is the table which can be saved in CSV or Markdown format.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.xlsx import XLSXToDocument

    converter = XLSXToDocument()
    results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # ,A,B
    # 1,col_a,col_b
    # 2,1.5,test
    ```
    """

    def __init__(
        self,
        table_format: Literal["csv", "markdown"] = "csv",
        sheet_name: Union[str, int, List[Union[str, int]], None] = None,
        read_excel_kwargs: Optional[Dict[str, Any]] = None,
        table_format_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates an XLSXToDocument component.

        :param table_format: The format to convert the Excel file to. Either "csv" or "markdown".
        :param sheet_name: The name (or zero-based position) of the sheet to read, or a list of them.
            If None, all sheets are read.
        :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`.
            The keys `sheet_name`, `header` and `engine` are always set by this component and override
            any value given here.
            See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel
        :param table_format_kwargs: Additional keyword arguments to pass to the table format function.
            - If `table_format` is "csv", these arguments are passed to `pandas.DataFrame.to_csv`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
            - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
        :raises ValueError: If `table_format` is neither "csv" nor "markdown".
        """
        xlsx_import.check()
        # Validate before touching any instance state so a rejected value never ends up on the object.
        if table_format not in ["csv", "markdown"]:
            raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")
        if table_format == "markdown":
            # `DataFrame.to_markdown` delegates to tabulate, so it is only required for this format.
            tabulate_import.check()
        self.table_format = table_format
        self.sheet_name = sheet_name
        self.read_excel_kwargs = read_excel_kwargs or {}
        self.table_format_kwargs = table_format_kwargs or {}

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Converts XLSX files to Documents, one Document per sheet that is read.

        Sources that cannot be read or parsed are skipped with a warning instead of raising.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents: List[Document] = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            try:
                tables, tables_metadata = self._extract_tables(bytestream)
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # One Document per extracted table. Later mappings win on key clashes:
            # ByteStream meta < user-supplied meta < sheet metadata.
            for table, excel_metadata in zip(tables, tables_metadata):
                merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
                documents.append(Document(content=table, meta=merged_metadata))

        return {"documents": documents}

    @staticmethod
    def _generate_excel_column_names(n_cols: int) -> List[str]:
        """
        Returns the first `n_cols` Excel-style column labels: A, B, ..., Z, AA, AB, ...

        :param n_cols: Number of labels to generate.
        :returns: The labels in column order; an empty list if `n_cols` is not positive.
        """
        result = []
        for i in range(n_cols):
            col_name = ""
            num = i
            # Bijective base-26: there is no zero digit, hence the `- 1` after each division.
            while num >= 0:
                col_name = chr(num % 26 + 65) + col_name
                num = num // 26 - 1
            result.append(col_name)
        return result

    def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict]]:
        """
        Extracts the tables from an Excel file.

        :param bytestream: The ByteStream holding the raw XLSX data.
        :returns: A tuple of two parallel lists: the serialized tables (CSV or Markdown text) and the
            per-table metadata of the form `{"xlsx": {"sheet_name": <key>}}`.
        """
        resolved_read_excel_kwargs = {
            **self.read_excel_kwargs,
            "sheet_name": self.sheet_name,
            "header": None,  # Don't assign any pandas column labels
            "engine": "openpyxl",  # Use openpyxl as the engine to read the Excel file
        }
        sheet_to_dataframe = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs)
        # A single requested sheet comes back as a bare DataFrame; normalize it to the mapping form.
        if isinstance(sheet_to_dataframe, pd.DataFrame):
            sheet_to_dataframe = {self.sheet_name: sheet_to_dataframe}

        tables = []
        metadata = []
        for key, df in sheet_to_dataframe.items():
            # Row starts at 1 in Excel
            df.index = df.index + 1
            # Excel column names are Alphabet Characters
            df.columns = self._generate_excel_column_names(df.shape[1])

            if self.table_format == "csv":
                resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs}
                tables.append(df.to_csv(**resolved_kwargs))
            else:
                resolved_kwargs = {
                    "index": True,
                    "headers": df.columns,
                    "tablefmt": "pipe",
                    **self.table_format_kwargs,
                }
                # to_markdown uses tabulate
                tables.append(df.to_markdown(**resolved_kwargs))
            # add sheet_name to metadata
            metadata.append({"xlsx": {"sheet_name": key}})
        return tables, metadata
import logging
from typing import Union

import pytest

from haystack.components.converters.xlsx import XLSXToDocument


class TestXLSXToDocument:
    """Tests for the XLSXToDocument converter, driven by the workbooks under `test_files/xlsx`."""

    def test_init(self) -> None:
        converter = XLSXToDocument()
        assert converter.sheet_name is None
        assert converter.read_excel_kwargs == {}
        assert converter.table_format == "csv"
        assert converter.table_format_kwargs == {}

    def test_init_unsupported_table_format(self) -> None:
        # Anything other than "csv" / "markdown" must be rejected at construction time.
        with pytest.raises(ValueError, match="Unsupported export format"):
            XLSXToDocument(table_format="html")  # type: ignore[arg-type]

    def test_run_basic_tables(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n"
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert (
            documents[0].content
            == ",A,B,C,D,E,F\n1,,,,,,\n2,,,,,,\n3,,col_a,col_b,,,\n4,,1.5,test,,col_c,col_d\n5,,,,,3,True\n"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "multiple_tables.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_markdown(self, test_files_path) -> None:
        converter = XLSXToDocument(table_format="markdown")
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        # tabulate's "pipe" format pads every cell to the column width, so the spacing is significant.
        assert (
            documents[0].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_a | col_b |\n|  2 | 1.5   | test  |"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert (
            documents[1].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_c | col_d |\n|  2 | True  | nan   |"
        )
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    @pytest.mark.parametrize(
        "sheet_name, expected_sheet_name, expected_content",
        [
            ("Basic Table", "Basic Table", ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            ("Table Missing Value", "Table Missing Value", ",A,B\n1,col_c,col_d\n2,True,\n"),
            (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"),
        ],
    )
    def test_run_sheet_name(
        self,
        sheet_name: Union[int, str],
        expected_sheet_name: Union[int, str],
        expected_content: str,
        test_files_path,
    ) -> None:
        # The metadata echoes whatever was passed as `sheet_name`, so an index stays an int.
        converter = XLSXToDocument(sheet_name=sheet_name)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths)
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == expected_content
        assert documents[0].meta == {
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": expected_sheet_name},
        }

    def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B\n1,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }

    def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None:
        converter = XLSXToDocument()
        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=sources)
            assert "sample_pdf_1.pdf and convert it" in caplog.text
            assert results["documents"] == []

    def test_run_error_non_existent_file(self, caplog: pytest.LogCaptureFixture) -> None:
        converter = XLSXToDocument()
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            converter.run(sources=paths)
            assert "Could not read non_existing_file.docx" in caplog.text
a/test/test_files/xlsx/basic_tables_two_sheets.xlsx b/test/test_files/xlsx/basic_tables_two_sheets.xlsx new file mode 100644 index 0000000000..15254ddb8c Binary files /dev/null and b/test/test_files/xlsx/basic_tables_two_sheets.xlsx differ diff --git a/test/test_files/xlsx/multiple_tables.xlsx b/test/test_files/xlsx/multiple_tables.xlsx new file mode 100644 index 0000000000..b3cd5ad343 Binary files /dev/null and b/test/test_files/xlsx/multiple_tables.xlsx differ diff --git a/test/test_files/xlsx/table_empty_rows_and_columns.xlsx b/test/test_files/xlsx/table_empty_rows_and_columns.xlsx new file mode 100644 index 0000000000..f599b8c336 Binary files /dev/null and b/test/test_files/xlsx/table_empty_rows_and_columns.xlsx differ