Skip to content

Commit

Permalink
feat: allow list of file paths in convert_files_to_docs (#5961)
Browse files Browse the repository at this point in the history
* feat: allow list of file paths in `convert_files_to_docs`

* Fix validation

* Fix check errors
  • Loading branch information
DanShatford authored Oct 9, 2023
1 parent 13fb7c5 commit 0704879
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 14 deletions.
49 changes: 35 additions & 14 deletions haystack/utils/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@


def convert_files_to_docs(
dir_path: str,
dir_path: Optional[str] = None,
clean_func: Optional[Callable] = None,
split_paragraphs: bool = False,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
file_paths: Optional[List[Path]] = None,
) -> List[Document]:
"""
Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a
Document Store.
Convert files (.txt, .pdf, .docx) to Documents that can be written to a Document Store.
:param dir_path: The path of the directory containing the Files.
Files can be specified by giving a directory path, a list of file paths, or both. If a directory path is given then
all files with the allowed suffixes in the directory's subdirectories will be converted.
:param dir_path: The path of a directory that contains Files to be converted, including in its subdirectories.
:param clean_func: A custom cleaning function that gets applied to each Document (input: str, output: str).
:param split_paragraphs: Whether to split text by paragraph.
:param encoding: Character encoding to use when converting pdf documents.
Expand All @@ -30,11 +33,18 @@ def convert_files_to_docs(
To ensure you don't have duplicate Documents in your Document Store if texts are
not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field.
If you do this, the Document ID will be generated by using the content and the defined metadata.
:param file_paths: A list of paths of Files to be converted.
"""
# Importing top-level causes a circular import
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter

file_paths = list(Path(dir_path).glob("**/*"))
if dir_path is None and file_paths is None:
raise ValueError("At least one of dir_path or file_paths must be set.")
if file_paths is None:
file_paths = []
if dir_path is not None:
file_paths = file_paths + list(Path(dir_path).glob("**/*"))

allowed_suffixes = [".pdf", ".txt", ".docx"]
suffix2converter: Dict[str, BaseConverter] = {}

Expand Down Expand Up @@ -87,42 +97,53 @@ def convert_files_to_docs(


def tika_convert_files_to_docs(
dir_path: str,
dir_path: Optional[str] = None,
clean_func: Optional[Callable] = None,
split_paragraphs: bool = False,
merge_short: bool = True,
merge_lowercase: bool = True,
id_hash_keys: Optional[List[str]] = None,
file_paths: Optional[List[Path]] = None,
) -> List[Document]:
"""
Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a
Document Store.
Convert files (.txt, .pdf) to Documents that can be written to a Document Store.
Files can be specified by giving a directory path, a list of file paths, or both. If a directory path is given then
all files with the allowed suffixes in the directory's subdirectories will be converted.
:param merge_lowercase: Whether to convert merged paragraphs to lowercase.
:param merge_short: Whether to allow merging of short paragraphs
:param dir_path: The path to the directory containing the files.
:param dir_path: The path of a directory that contains Files to be converted, including in its subdirectories.
:param clean_func: A custom cleaning function that gets applied to each doc (input: str, output:str).
:param split_paragraphs: Whether to split text by paragraphs.
:param id_hash_keys: A list of Document attribute names from which the Document ID should be hashed from.
Useful for generating unique IDs even if the Document contents are identical.
To ensure you don't have duplicate Documents in your Document Store if texts are
not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field.
If you do this, the Document ID will be generated by using the content and the defined metadata.
:param file_paths: A list of paths of Files to be converted.
"""
try:
from haystack.nodes.file_converter import TikaConverter
except Exception as ex:
logger.error("Tika not installed. Please install tika and try again. Error: %s", ex)
raise ex
converter = TikaConverter()
paths = list(Path(dir_path).glob("**/*"))

if dir_path is None and file_paths is None:
raise ValueError("At least one of dir_path or file_paths must be set.")
if file_paths is None:
file_paths = []
if dir_path is not None:
file_paths = file_paths + list(Path(dir_path).glob("**/*"))

allowed_suffixes = [".pdf", ".txt"]
file_paths: List[Path] = []
file_paths_to_convert: List[Path] = []

for path in paths:
for path in file_paths:
file_suffix = path.suffix.lower()
if file_suffix in allowed_suffixes:
file_paths.append(path)
file_paths_to_convert.append(path)
elif not path.is_dir():
logger.warning(
"Skipped file %s as type %s is not supported here. "
Expand All @@ -132,7 +153,7 @@ def tika_convert_files_to_docs(
)

documents = []
for path in file_paths:
for path in file_paths_to_convert:
logger.info("Converting %s", path)
# TikaConverter returns a list containing a single Document
document = converter.convert(path)[0]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
enhancements:
- |
Add `file_paths` argument to `utils.convert_files_to_docs` and
`utils.tika_convert_files_to_docs` to allow input of a list of file paths to be
converted, instead of, or as well as, the current `dir_path` argument.
5 changes: 5 additions & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,11 @@ def samples_path():
return Path(__file__).parent / "samples"


@pytest.fixture
def sample_txt_file_paths_list(samples_path):
    """Return the paths of all .txt sample documents under the ``docs`` samples directory."""
    docs_dir = samples_path / "docs"
    return [txt_path for txt_path in docs_dir.glob("*.txt")]


@pytest.fixture
def preview_samples_path():
    """Return the directory that holds the test files used by the preview tests."""
    tests_root = Path(__file__).parent
    return tests_root / "preview" / "test_files"
Expand Down
34 changes: 34 additions & 0 deletions test/others/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,32 @@ def test_convert_pdf_files_to_docs(samples_path):
assert documents and len(documents) > 0


@pytest.mark.unit
def test_convert_list_of_file_paths_to_docs(sample_txt_file_paths_list):
    """convert_files_to_docs accepts a bare list of file paths without any dir_path."""
    docs = convert_files_to_docs(
        file_paths=sample_txt_file_paths_list,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    assert docs and len(docs) > 0


@pytest.mark.unit
def test_convert_dirpath_and_file_paths_list_to_docs(samples_path, sample_txt_file_paths_list):
    """dir_path and file_paths can be combined; files from both sources are converted."""
    # NOTE(review): dir_path receives a pathlib.Path here although the signature
    # hints Optional[str] — presumably fine since Path(dir_path) accepts both;
    # confirm against the converter's signature.
    docx_dir = samples_path / "docx"
    docs = convert_files_to_docs(
        dir_path=docx_dir,
        file_paths=sample_txt_file_paths_list,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    assert docs and len(docs) > 0


@pytest.mark.unit
def test_convert_with_no_dirpath_or_file_paths():
    """Calling convert_files_to_docs with neither dir_path nor file_paths raises ValueError."""
    # Callable form of pytest.raises: invoke with no arguments so both sources default to None.
    pytest.raises(ValueError, convert_files_to_docs)


@pytest.mark.unit
def test_get_filename_extension_from_url_without_params_zip():
url = "http://www.mysite.com/resources/myfile.zip"
Expand Down Expand Up @@ -226,6 +252,14 @@ def test_tika_convert_files_to_docs(samples_path):
assert documents and len(documents) > 0


@pytest.mark.tika
def test_tika_convert_list_of_file_paths_to_docs(sample_txt_file_paths_list):
    """tika_convert_files_to_docs accepts a bare list of file paths without any dir_path."""
    docs = tika_convert_files_to_docs(
        file_paths=sample_txt_file_paths_list,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    assert docs and len(docs) > 0


@pytest.mark.unit
def test_calculate_context_similarity_on_parts_of_whole_document(sample_context):
whole_document = sample_context
Expand Down

0 comments on commit 0704879

Please sign in to comment.