From 07048791aa072222e766d0c781b13672514e601a Mon Sep 17 00:00:00 2001 From: DanShatford <143026563+DanShatford@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:19:03 +0100 Subject: [PATCH] feat: allow list of file paths in `convert_files_to_docs` (#5961) * feat: allow list of file paths in `convert_files_to_docs` * Fix validation * Fix check errors --- haystack/utils/preprocessing.py | 49 +++++++++++++------ ...t-files-to-docs-list-f75a057249ba8992.yaml | 6 +++ test/conftest.py | 5 ++ test/others/test_utils.py | 34 +++++++++++++ 4 files changed, 80 insertions(+), 14 deletions(-) create mode 100644 releasenotes/notes/issue-5616-convert-files-to-docs-list-f75a057249ba8992.yaml diff --git a/haystack/utils/preprocessing.py b/haystack/utils/preprocessing.py index 55b2cce9a3..9889e855ee 100644 --- a/haystack/utils/preprocessing.py +++ b/haystack/utils/preprocessing.py @@ -11,17 +11,20 @@ def convert_files_to_docs( - dir_path: str, + dir_path: Optional[str] = None, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, + file_paths: Optional[List[Path]] = None, ) -> List[Document]: """ - Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a - Document Store. + Convert files (.txt, .pdf, .docx) to Documents that can be written to a Document Store. - :param dir_path: The path of the directory containing the Files. + Files can be specified by giving a directory path, a list of file paths, or both. If a directory path is given then + all files with the allowed suffixes in the directory's subdirectories will be converted. + + :param dir_path: The path of a directory that contains Files to be converted, including in its subdirectories. :param clean_func: A custom cleaning function that gets applied to each Document (input: str, output: str). :param split_paragraphs: Whether to split text by paragraph. 
:param encoding: Character encoding to use when converting pdf documents. @@ -30,11 +33,18 @@ def convert_files_to_docs( To ensure you don't have duplicate Documents in your Document Store if texts are not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field. If you do this, the Document ID will be generated by using the content and the defined metadata. + :param file_paths: A list of paths of Files to be converted. """ # Importing top-level causes a circular import from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter - file_paths = list(Path(dir_path).glob("**/*")) + if dir_path is None and file_paths is None: + raise ValueError("At least one of dir_path or file_paths must be set.") + if file_paths is None: + file_paths = [] + if dir_path is not None: + file_paths = file_paths + list(Path(dir_path).glob("**/*")) + allowed_suffixes = [".pdf", ".txt", ".docx"] suffix2converter: Dict[str, BaseConverter] = {} @@ -87,20 +97,23 @@ def convert_files_to_docs( def tika_convert_files_to_docs( - dir_path: str, + dir_path: Optional[str] = None, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, merge_short: bool = True, merge_lowercase: bool = True, id_hash_keys: Optional[List[str]] = None, + file_paths: Optional[List[Path]] = None, ) -> List[Document]: """ - Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a - Document Store. + Convert files (.txt, .pdf) to Documents that can be written to a Document Store. + + Files can be specified by giving a directory path, a list of file paths, or both. If a directory path is given then + all files with the allowed suffixes in the directory's subdirectories will be converted. :param merge_lowercase: Whether to convert merged paragraphs to lowercase. 
:param merge_short: Whether to allow merging of short paragraphs - :param dir_path: The path to the directory containing the files. + :param dir_path: The path of a directory that contains Files to be converted, including in its subdirectories. :param clean_func: A custom cleaning function that gets applied to each doc (input: str, output:str). :param split_paragraphs: Whether to split text by paragraphs. :param id_hash_keys: A list of Document attribute names from which the Document ID should be hashed from. @@ -108,6 +121,7 @@ def tika_convert_files_to_docs( To ensure you don't have duplicate Documents in your Document Store if texts are not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field. If you do this, the Document ID will be generated by using the content and the defined metadata. + :param file_paths: A list of paths of Files to be converted. """ try: from haystack.nodes.file_converter import TikaConverter @@ -115,14 +129,21 @@ def tika_convert_files_to_docs( logger.error("Tika not installed. Please install tika and try again. Error: %s", ex) raise ex converter = TikaConverter() - paths = list(Path(dir_path).glob("**/*")) + + if dir_path is None and file_paths is None: + raise ValueError("At least one of dir_path or file_paths must be set.") + if file_paths is None: + file_paths = [] + if dir_path is not None: + file_paths = file_paths + list(Path(dir_path).glob("**/*")) + allowed_suffixes = [".pdf", ".txt"] - file_paths: List[Path] = [] + file_paths_to_convert: List[Path] = [] - for path in paths: + for path in file_paths: file_suffix = path.suffix.lower() if file_suffix in allowed_suffixes: - file_paths.append(path) + file_paths_to_convert.append(path) elif not path.is_dir(): logger.warning( "Skipped file %s as type %s is not supported here. 
" @@ -132,7 +153,7 @@ def tika_convert_files_to_docs( ) documents = [] - for path in file_paths: + for path in file_paths_to_convert: logger.info("Converting %s", path) # TikaConverter returns a list containing a single Document document = converter.convert(path)[0] diff --git a/releasenotes/notes/issue-5616-convert-files-to-docs-list-f75a057249ba8992.yaml b/releasenotes/notes/issue-5616-convert-files-to-docs-list-f75a057249ba8992.yaml new file mode 100644 index 0000000000..f1f05697db --- /dev/null +++ b/releasenotes/notes/issue-5616-convert-files-to-docs-list-f75a057249ba8992.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Add `file_paths` argument to `utils.convert_files_to_docs` to allow + input of list of file paths to be converted, instead of, or as well as, + the current `dir_path` argument. diff --git a/test/conftest.py b/test/conftest.py index 77823ca2eb..630ddf4946 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -882,6 +882,11 @@ def samples_path(): return Path(__file__).parent / "samples" +@pytest.fixture +def sample_txt_file_paths_list(samples_path): + return list((samples_path / "docs").glob("*.txt")) + + @pytest.fixture def preview_samples_path(): return Path(__file__).parent / "preview" / "test_files" diff --git a/test/others/test_utils.py b/test/others/test_utils.py index ec4a64773a..9e0c86a99a 100644 --- a/test/others/test_utils.py +++ b/test/others/test_utils.py @@ -196,6 +196,32 @@ def test_convert_pdf_files_to_docs(samples_path): assert documents and len(documents) > 0 +@pytest.mark.unit +def test_convert_list_of_file_paths_to_docs(sample_txt_file_paths_list): + documents = convert_files_to_docs( + file_paths=sample_txt_file_paths_list, clean_func=clean_wiki_text, split_paragraphs=True + ) + assert documents and len(documents) > 0 + + +@pytest.mark.unit +def test_convert_dirpath_and_file_paths_list_to_docs(samples_path, sample_txt_file_paths_list): + docx_samples_path = samples_path / "docx" + documents = convert_files_to_docs( + 
dir_path=docx_samples_path, + file_paths=sample_txt_file_paths_list, + clean_func=clean_wiki_text, + split_paragraphs=True, + ) + assert documents and len(documents) > 0 + + +@pytest.mark.unit +def test_convert_with_no_dirpath_or_file_paths(): + with pytest.raises(ValueError): + convert_files_to_docs() + + @pytest.mark.unit def test_get_filename_extension_from_url_without_params_zip(): url = "http://www.mysite.com/resources/myfile.zip" @@ -226,6 +252,14 @@ def test_tika_convert_files_to_docs(samples_path): assert documents and len(documents) > 0 +@pytest.mark.tika +def test_tika_convert_list_of_file_paths_to_docs(sample_txt_file_paths_list): + documents = tika_convert_files_to_docs( + file_paths=sample_txt_file_paths_list, clean_func=clean_wiki_text, split_paragraphs=True + ) + assert documents and len(documents) > 0 + + @pytest.mark.unit def test_calculate_context_similarity_on_parts_of_whole_document(sample_context): whole_document = sample_context