Skip to content

Commit

Permalink
feat: allow list of file paths in convert_files_to_docs (#5961)
Browse files Browse the repository at this point in the history
* feat: allow list of file paths in `convert_files_to_docs`

* Fix validation

* Fix check errors
  • Loading branch information
DanShatford authored Oct 9, 2023
1 parent 13fb7c5 commit 0704879
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 14 deletions.
49 changes: 35 additions & 14 deletions haystack/utils/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@


def convert_files_to_docs(
dir_path: str,
dir_path: Optional[str] = None,
clean_func: Optional[Callable] = None,
split_paragraphs: bool = False,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
file_paths: Optional[List[Path]] = None,
) -> List[Document]:
"""
Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a
Document Store.
Convert files (.txt, .pdf, .docx) to Documents that can be written to a Document Store.
:param dir_path: The path of the directory containing the Files.
Files can be specified by giving a directory path, a list of file paths, or both. If a directory path is given then
all files with the allowed suffixes in the directory's subdirectories will be converted.
:param dir_path: The path of a directory that contains Files to be converted, including in its subdirectories.
:param clean_func: A custom cleaning function that gets applied to each Document (input: str, output: str).
:param split_paragraphs: Whether to split text by paragraph.
:param encoding: Character encoding to use when converting pdf documents.
Expand All @@ -30,11 +33,18 @@ def convert_files_to_docs(
To ensure you don't have duplicate Documents in your Document Store if texts are
not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field.
If you do this, the Document ID will be generated by using the content and the defined metadata.
:param file_paths: A list of paths of Files to be converted.
"""
# Importing top-level causes a circular import
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter

file_paths = list(Path(dir_path).glob("**/*"))
if dir_path is None and file_paths is None:
raise ValueError("At least one of dir_path or file_paths must be set.")
if file_paths is None:
file_paths = []
if dir_path is not None:
file_paths = file_paths + list(Path(dir_path).glob("**/*"))

allowed_suffixes = [".pdf", ".txt", ".docx"]
suffix2converter: Dict[str, BaseConverter] = {}

Expand Down Expand Up @@ -87,42 +97,53 @@ def convert_files_to_docs(


def tika_convert_files_to_docs(
dir_path: str,
dir_path: Optional[str] = None,
clean_func: Optional[Callable] = None,
split_paragraphs: bool = False,
merge_short: bool = True,
merge_lowercase: bool = True,
id_hash_keys: Optional[List[str]] = None,
file_paths: Optional[List[Path]] = None,
) -> List[Document]:
"""
Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a
Document Store.
Convert files (.txt, .pdf) to Documents that can be written to a Document Store.
Files can be specified by giving a directory path, a list of file paths, or both. If a directory path is given then
all files with the allowed suffixes in the directory's subdirectories will be converted.
:param merge_lowercase: Whether to convert merged paragraphs to lowercase.
:param merge_short: Whether to allow merging of short paragraphs
:param dir_path: The path to the directory containing the files.
:param dir_path: The path of a directory that contains Files to be converted, including in its subdirectories.
:param clean_func: A custom cleaning function that gets applied to each doc (input: str, output:str).
:param split_paragraphs: Whether to split text by paragraphs.
:param id_hash_keys: A list of Document attribute names from which the Document ID should be hashed from.
Useful for generating unique IDs even if the Document contents are identical.
To ensure you don't have duplicate Documents in your Document Store if texts are
not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field.
If you do this, the Document ID will be generated by using the content and the defined metadata.
:param file_paths: A list of paths of Files to be converted.
"""
try:
from haystack.nodes.file_converter import TikaConverter
except Exception as ex:
logger.error("Tika not installed. Please install tika and try again. Error: %s", ex)
raise ex
converter = TikaConverter()
paths = list(Path(dir_path).glob("**/*"))

if dir_path is None and file_paths is None:
raise ValueError("At least one of dir_path or file_paths must be set.")
if file_paths is None:
file_paths = []
if dir_path is not None:
file_paths = file_paths + list(Path(dir_path).glob("**/*"))

allowed_suffixes = [".pdf", ".txt"]
file_paths: List[Path] = []
file_paths_to_convert: List[Path] = []

for path in paths:
for path in file_paths:
file_suffix = path.suffix.lower()
if file_suffix in allowed_suffixes:
file_paths.append(path)
file_paths_to_convert.append(path)
elif not path.is_dir():
logger.warning(
"Skipped file %s as type %s is not supported here. "
Expand All @@ -132,7 +153,7 @@ def tika_convert_files_to_docs(
)

documents = []
for path in file_paths:
for path in file_paths_to_convert:
logger.info("Converting %s", path)
# TikaConverter returns a list containing a single Document
document = converter.convert(path)[0]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
enhancements:
- |
Add `file_paths` argument to `utils.convert_files_to_docs` and
`utils.tika_convert_files_to_docs` to allow input of a list of file paths to be
converted, instead of, or as well as, the current `dir_path` argument.
5 changes: 5 additions & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,11 @@ def samples_path():
return Path(__file__).parent / "samples"


@pytest.fixture
def sample_txt_file_paths_list(samples_path):
    """Return the paths of all .txt sample documents under the ``docs`` samples directory."""
    docs_dir = samples_path / "docs"
    return [txt_path for txt_path in docs_dir.glob("*.txt")]


@pytest.fixture
def preview_samples_path():
    """Return the directory that holds the test files used by the preview tests."""
    tests_root = Path(__file__).parent
    return tests_root / "preview" / "test_files"
Expand Down
34 changes: 34 additions & 0 deletions test/others/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,32 @@ def test_convert_pdf_files_to_docs(samples_path):
assert documents and len(documents) > 0


@pytest.mark.unit
def test_convert_list_of_file_paths_to_docs(sample_txt_file_paths_list):
    """convert_files_to_docs accepts a bare list of file paths without any dir_path."""
    docs = convert_files_to_docs(
        file_paths=sample_txt_file_paths_list,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    assert docs and len(docs) > 0


@pytest.mark.unit
def test_convert_dirpath_and_file_paths_list_to_docs(samples_path, sample_txt_file_paths_list):
    """dir_path and file_paths can be combined; files from both sources are converted."""
    # NOTE(review): dir_path receives a pathlib.Path here although the signature
    # hints Optional[str] — presumably fine since Path(dir_path) accepts both;
    # confirm against the converter's signature.
    docx_dir = samples_path / "docx"
    docs = convert_files_to_docs(
        dir_path=docx_dir,
        file_paths=sample_txt_file_paths_list,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    assert docs and len(docs) > 0


@pytest.mark.unit
def test_convert_with_no_dirpath_or_file_paths():
    """Calling convert_files_to_docs with neither dir_path nor file_paths raises ValueError."""
    # Callable form of pytest.raises: invoke with no arguments so both sources default to None.
    pytest.raises(ValueError, convert_files_to_docs)


@pytest.mark.unit
def test_get_filename_extension_from_url_without_params_zip():
url = "http://www.mysite.com/resources/myfile.zip"
Expand Down Expand Up @@ -226,6 +252,14 @@ def test_tika_convert_files_to_docs(samples_path):
assert documents and len(documents) > 0


@pytest.mark.tika
def test_tika_convert_list_of_file_paths_to_docs(sample_txt_file_paths_list):
    """tika_convert_files_to_docs accepts a bare list of file paths without any dir_path."""
    docs = tika_convert_files_to_docs(
        file_paths=sample_txt_file_paths_list,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    assert docs and len(docs) > 0


@pytest.mark.unit
def test_calculate_context_similarity_on_parts_of_whole_document(sample_context):
whole_document = sample_context
Expand Down

0 comments on commit 0704879

Please sign in to comment.