From e87b131e13318503670768d538703b4a65df016f Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Thu, 5 Oct 2023 12:12:20 +0200
Subject: [PATCH 1/9] remove whitespaces, substrings, regex, empty lines

---
 .../components/preprocessors/__init__.py      |   3 +-
 .../preprocessors/text_document_cleaner.py    | 118 ++++++++++++++++++
 .../test_text_document_cleaner.py             |  79 ++++++++++++
 3 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 haystack/preview/components/preprocessors/text_document_cleaner.py
 create mode 100644 test/preview/components/preprocessors/test_text_document_cleaner.py

diff --git a/haystack/preview/components/preprocessors/__init__.py b/haystack/preview/components/preprocessors/__init__.py
index 33a0e2cd18..a4b48ecd30 100644
--- a/haystack/preview/components/preprocessors/__init__.py
+++ b/haystack/preview/components/preprocessors/__init__.py
@@ -1,3 +1,4 @@
+from haystack.preview.components.preprocessors.text_document_cleaner import TextDocumentCleaner
 from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
 
-__all__ = ["TextDocumentSplitter"]
+__all__ = ["TextDocumentSplitter", "TextDocumentCleaner"]
diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
new file mode 100644
index 0000000000..93433c0384
--- /dev/null
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -0,0 +1,118 @@
+import re
+from copy import deepcopy
+from typing import Any, Dict, List, Optional
+
+from haystack.preview import Document, component, default_from_dict, default_to_dict
+
+
+@component
+class TextDocumentCleaner:
+    """
+    Makes text documents more readable by cleaning empty lines, extra whitespaces, headers and footers, etc.
+    This is useful for preparing the documents for further processing by LLMs.
+    """
+
+    def __init__(
+        self,
+        remove_empty_lines: bool = True,
+        remove_extra_whitespaces: bool = True,
+        remove_repeated_substrings: bool = False,
+        remove_substrings: Optional[List[str]] = None,
+        remove_regex: Optional[str] = None,
+    ):
+        """
+        :param remove_empty_lines: Whether to remove empty lines.
+        :param remove_extra_whitespaces: Whether to remove extra whitespaces.
+        :param remove_repeated_substrings: Whether to remove repeated substrings, such as headers and footers.
+        :param remove_substrings: List of substrings to remove from the text.
+        :param remove_regex: Regex to match and replace substrings by "".
+        """
+
+        self.remove_empty_lines = remove_empty_lines
+        self.remove_extra_whitespaces = remove_extra_whitespaces
+        self.remove_repeated_substrings = remove_repeated_substrings
+        self.remove_substrings = remove_substrings
+        self.remove_regex = remove_regex
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
+            raise TypeError("TextDocumentCleaner expects a List of Documents as input.")
+
+        cleaned_docs = []
+        for doc in documents:
+            if doc.text is None:
+                raise ValueError(
+                    f"TextDocumentCleaner only works with text documents but document.text for document ID {doc.id} is None."
+                )
+            text = doc.text
+
+            if self.remove_empty_lines:
+                text = self._remove_empty_lines(text)
+            if self.remove_extra_whitespaces:
+                text = self._remove_extra_whitespaces(text)
+            if self.remove_repeated_substrings:
+                text = self._remove_repeated_substrings(text)
+            if self.remove_substrings:
+                text = self._remove_substrings(text, self.remove_substrings)
+            if self.remove_regex:
+                text = self._remove_regex(text, self.remove_regex)
+
+            cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
+
+        return {"documents": cleaned_docs}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self,
+            clean_empty_lines=self.remove_empty_lines,
+            clean_whitespaces=self.remove_extra_whitespaces,
+            clean_repeated_substrings=self.remove_repeated_substrings,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "TextDocumentCleaner":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    def _remove_empty_lines(self, text: str) -> str:
+        """
+        Remove empty lines and lines that contain nothing but whitespaces from text.
+        :param text: Text to clean.
+        """
+        lines = text.split("\n")
+        non_empty_lines = filter(lambda line: line.strip() != "", lines)
+        return "\n".join(non_empty_lines)
+
+    def _remove_extra_whitespaces(self, text: str) -> str:
+        """
+        Remove extra whitespaces from text.
+        :param text: Text to clean.
+        """
+        return re.sub(r"\s\s+", " ", text).strip()
+
+    def _remove_regex(self, text: str, regex: str) -> str:
+        """
+        Remove substrings that match the specified regex from the text.
+        :param text: Text to clean.
+        :param regex: Regex to match and replace substrings by "".
+        """
+        return re.sub(regex, "", text).strip()
+
+    def _remove_substrings(self, text: str, substrings: List[str]) -> str:
+        """
+        Remove all specified substrings from the text.
+        :param text: Text to clean.
+        :param substrings: Substrings to remove.
+        """
+        for substring in substrings:
+            text = text.replace(substring, "")
+        return text
+
+    def _remove_repeated_substrings(self, text: str) -> str:
+        return text
diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py
new file mode 100644
index 0000000000..07681f7bbf
--- /dev/null
+++ b/test/preview/components/preprocessors/test_text_document_cleaner.py
@@ -0,0 +1,79 @@
+import pytest
+
+from haystack.preview import Document
+from haystack.preview.components.preprocessors import TextDocumentCleaner
+
+
+class TestTextDocumentCleaner:
+    @pytest.mark.unit
+    def test_non_text_document(self):
+        with pytest.raises(
+            ValueError, match="TextDocumentCleaner only works with text documents but document.text for document ID"
+        ):
+            cleaner = TextDocumentCleaner()
+            cleaner.run(documents=[Document()])
+
+    @pytest.mark.unit
+    def test_single_doc(self):
+        with pytest.raises(TypeError, match="TextDocumentCleaner expects a List of Documents as input."):
+            cleaner = TextDocumentCleaner()
+            cleaner.run(documents=Document())
+
+    @pytest.mark.unit
+    def test_empty_list(self):
+        cleaner = TextDocumentCleaner()
+        result = cleaner.run(documents=[])
+        assert result == {"documents": []}
+
+    @pytest.mark.unit
+    def test_clean_empty_lines(self):
+        cleaner = TextDocumentCleaner(remove_extra_whitespaces=False)
+        result = cleaner.run(
+            documents=[
+                Document(
+                    text="This is a text with some words. "
+                    ""
+                    "There is a second sentence. "
+                    ""
+                    "And there is a third sentence."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 1
+        assert (
+            result["documents"][0].text
+            == "This is a text with some words. There is a second sentence. And there is a third sentence."
+        )
+
+    @pytest.mark.unit
+    def test_clean_whitespaces(self):
+        cleaner = TextDocumentCleaner(clean_empty_lines=False)
+        result = cleaner.run(
+            documents=[
+                Document(
+                    text=" This is a text with some words. "
+                    ""
+                    "There is a second sentence. "
+                    ""
+                    "And there is a third sentence. "
+                )
+            ]
+        )
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].text == (
+            "This is a text with some words. " "" "There is a second sentence. " "" "And there is a third sentence."
+        )
+
+    @pytest.mark.unit
+    def test_remove_substrings(self):
+        cleaner = TextDocumentCleaner(remove_substrings=["This", "A", "words"])
+        result = cleaner.run(documents=[Document(text="This is a text with some words.")])
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].text == (" is a text with some .")
+
+    @pytest.mark.unit
+    def test_remove_regex(self):
+        cleaner = TextDocumentCleaner(remove_regex=r"\s\s+")
+        result = cleaner.run(documents=[Document(text="This is a text with some words.")])
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].text == ("This is a text with some words.")

From 813ee6b68ba27e22a8a707745ec375696b0465a1 Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Thu, 5 Oct 2023 17:19:07 +0200
Subject: [PATCH 2/9] remove repeated substrings

---
 .../preprocessors/text_document_cleaner.py    | 89 ++++++++++++++++++-
 .../test_text_document_cleaner.py             | 48 ++++++++--
 2 files changed, 130 insertions(+), 7 deletions(-)

diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index 93433c0384..fc7c0e6ea7 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -1,6 +1,8 @@
 import re
 from copy import deepcopy
-from typing import Any, Dict, List, Optional
+from functools import partial, reduce
+from itertools import chain
+from typing import Any, Dict, List, Optional, Generator, Set
 
 from haystack.preview import Document, component, default_from_dict, default_to_dict
 
@@ -115,4 +117,89 @@ def _remove_substrings(self, text: str, substrings: List[str]) -> str:
         return text
 
     def _remove_repeated_substrings(self, text: str) -> str:
+        return self._find_and_remove_header_footer(
+            text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
+        )
+
+    def _find_and_remove_header_footer(
+        self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+    ) -> str:
+        """
+        Heuristic to find footers and headers across different pages by searching for the longest common string.
+        For headers, we only search in the first n_chars characters (for footer: last n_chars).
+        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+        but won't detect "Page 3 of 4" or similar.
+
+        :param n_chars: number of first/last characters where the header/footer shall be searched in
+        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
+        :param n_last_pages_to_ignore: number of last pages to ignore
+        :return: (cleaned pages, found_header_str, found_footer_str)
+        """
+
+        pages = text.split("\f")
+
+        # header
+        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_header = self._find_longest_common_ngram(start_of_pages)
+        if found_header:
+            pages = [page.replace(found_header, "") for page in pages]
+
+        # footer
+        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_footer = self._find_longest_common_ngram(end_of_pages)
+        if found_footer:
+            pages = [page.replace(found_footer, "") for page in pages]
+        # logger.debug("Removed header '%s' and footer '%s' in document", found_header, found_footer)
+        text = "\f".join(pages)
         return text
+
+    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
+        """
+        Return ngram (of tokens - currently split by whitespace)
+        :param seq: str, string from which the ngram shall be created
+        :param n: int, n of ngram
+        :return: str, ngram as string
+        """
+
+        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+        # we add a space here and remove it after creation of the ngrams again (see below)
+        seq = seq.replace("\n", " \n")
+        seq = seq.replace("\t", " \t")
+
+        words = seq.split(" ")
+        ngrams = (
+            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
+        )
+
+        return ngrams
+
+    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
+        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+        ngrams = map(partial(self._ngram, seq), lengths)
+        res = set(chain.from_iterable(ngrams))
+        return res
+
+    def _find_longest_common_ngram(
+        self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
+    ) -> Optional[str]:
+        """
+        Find the longest common ngram across different text sequences (e.g. start of pages).
+        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
+
+        :param sequences: list[str], list of strings that shall be searched for common n_grams
+        :param max_ngram: int, maximum length of ngram to consider
+        :param min_ngram: minimum length of ngram to consider
+        :return: str, common string of all sections
+        """
+        sequences = [s for s in sequences if s]  # filter empty sequences
+        if not sequences:
+            return None
+        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+        intersection = reduce(set.intersection, seqs_ngrams)
+
+        try:
+            longest = max(intersection, key=len)
+        except ValueError:
+            # no common sequence found
+            longest = ""
+        return longest if longest.strip() else None
diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py
index 07681f7bbf..7f012af70e 100644
--- a/test/preview/components/preprocessors/test_text_document_cleaner.py
+++ b/test/preview/components/preprocessors/test_text_document_cleaner.py
@@ -14,7 +14,7 @@ def test_non_text_document(self):
             cleaner.run(documents=[Document()])
 
     @pytest.mark.unit
-    def test_single_doc(self):
+    def test_single_document(self):
         with pytest.raises(TypeError, match="TextDocumentCleaner expects a List of Documents as input."):
             cleaner = TextDocumentCleaner()
             cleaner.run(documents=Document())
@@ -26,7 +26,7 @@ def test_empty_list(self):
         assert result == {"documents": []}
 
     @pytest.mark.unit
-    def test_clean_empty_lines(self):
+    def test_remove_empty_lines(self):
         cleaner = TextDocumentCleaner(remove_extra_whitespaces=False)
         result = cleaner.run(
             documents=[
@@ -46,8 +46,8 @@ def test_clean_empty_lines(self):
         )
 
     @pytest.mark.unit
-    def test_clean_whitespaces(self):
-        cleaner = TextDocumentCleaner(clean_empty_lines=False)
+    def test_remove_whitespaces(self):
+        cleaner = TextDocumentCleaner(remove_empty_lines=False)
         result = cleaner.run(
             documents=[
                 Document(
@@ -69,11 +69,47 @@ def test_remove_substrings(self):
         cleaner = TextDocumentCleaner(remove_substrings=["This", "A", "words"])
         result = cleaner.run(documents=[Document(text="This is a text with some words.")])
         assert len(result["documents"]) == 1
-        assert result["documents"][0].text == (" is a text with some .")
+        assert result["documents"][0].text == " is a text with some ."
 
     @pytest.mark.unit
     def test_remove_regex(self):
         cleaner = TextDocumentCleaner(remove_regex=r"\s\s+")
         result = cleaner.run(documents=[Document(text="This is a text with some words.")])
         assert len(result["documents"]) == 1
-        assert result["documents"][0].text == ("This is a text with some words.")
+        assert result["documents"][0].text == "This is a text with some words."
+
+    @pytest.mark.unit
+    def test_remove_repeated_substrings(self):
+        cleaner = TextDocumentCleaner(
+            remove_empty_lines=False, remove_extra_whitespaces=False, remove_repeated_substrings=True
+        )
+
+        text = """First Page\fThis is a header.
+        Page of
+        2
+        4
+        Lorem ipsum dolor sit amet
+        This is a footer number 1
+        This is footer number 2\fThis is a header.
+        Page of
+        3
+        4
+        Sid ut perspiciatis unde
+        This is a footer number 1
+        This is footer number 2\fThis is a header.
+        Page of
+        4
+        4
+        Sed do eiusmod tempor.
+        This is a footer number 1
+        This is footer number 2"""
+
+        expected_text = """First Page\f 2
+        4
+        Lorem ipsum dolor sit amet\f 3
+        4
+        Sid ut perspiciatis unde\f 4
+        4
+        Sed do eiusmod tempor."""
+        result = cleaner.run(documents=[Document(text=text)])
+        assert result["documents"][0].text == expected_text

From c5ff40958ab546534115c8ccd68ca30ebfc6e888 Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Thu, 5 Oct 2023 17:23:20 +0200
Subject: [PATCH 3/9] reno

---
 .../components/preprocessors/text_document_cleaner.py | 2 +-
 .../notes/text-document-cleaner-8afce831a2ac31ae.yaml | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml

diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index fc7c0e6ea7..2507cf5772 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -2,7 +2,7 @@
 from copy import deepcopy
 from functools import partial, reduce
 from itertools import chain
-from typing import Any, Dict, List, Optional, Generator, Set
+from typing import Any, Dict, Generator, List, Optional, Set
 
 from haystack.preview import Document, component, default_from_dict, default_to_dict
 
diff --git a/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml b/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml
new file mode 100644
index 0000000000..f15d8d82f5
--- /dev/null
+++ b/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Add TextDocumentCleaner, which removes extra whitespace, empty lines, headers, etc. from Documents containing text.
+    Useful as a preprocessing step before splitting into shorter text documents.

From c792b58a3e18c31bf83014aa9829e8e0b66a1802 Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Mon, 9 Oct 2023 11:56:45 +0200
Subject: [PATCH 4/9] return empty string as shortest common ngram

---
 .../preprocessors/text_document_cleaner.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index 2507cf5772..987e1dbf4a 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -179,9 +179,7 @@ def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
         res = set(chain.from_iterable(ngrams))
         return res
 
-    def _find_longest_common_ngram(
-        self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
-    ) -> Optional[str]:
+    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> str:
         """
         Find the longest common ngram across different text sequences (e.g. start of pages).
         Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
 
         :param sequences: list[str], list of strings that shall be searched for common n_grams
         :param max_ngram: int, maximum length of ngram to consider
         :param min_ngram: minimum length of ngram to consider
         :return: str, common string of all sections
         """
         sequences = [s for s in sequences if s]  # filter empty sequences
         if not sequences:
-            return None
+            return ""
         seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
         intersection = reduce(set.intersection, seqs_ngrams)
 
-        try:
-            longest = max(intersection, key=len)
-        except ValueError:
-            # no common sequence found
-            longest = ""
-        return longest if longest.strip() else None
+        longest = max(intersection, key=len, default="")
+        return longest if longest.strip() else ""

From f202acf68260634538a14982de2fd49f1dbd35ea Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Thu, 12 Oct 2023 23:20:24 +0200
Subject: [PATCH 5/9] address first half of review feedback

---
 .../preprocessors/text_document_cleaner.py    | 31 +++++++++---
 .../test_text_document_cleaner.py             | 49 +++++++++++++++++--
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index 987e1dbf4a..89cc3a601e 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from copy import deepcopy
 from functools import partial, reduce
@@ -6,12 +7,25 @@
 
 from haystack.preview import Document, component, default_from_dict, default_to_dict
 
+logger = logging.getLogger(__name__)
+
 
 @component
 class TextDocumentCleaner:
     """
     Makes text documents more readable by cleaning empty lines, extra whitespaces, headers and footers, etc.
     This is useful for preparing the documents for further processing by LLMs.
+
+    Example usage in an indexing pipeline:
+    document_store = MemoryDocumentStore()
+    p = Pipeline()
+    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
+    p.add_component(instance=TextDocumentCleaner(), name="cleaner")
+    p.add_component(instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter")
+    p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
+    p.connect("text_file_converter.documents", "cleaner.documents")
+    p.connect("cleaner.documents", "splitter.documents")
+    p.connect("splitter.documents", "writer.documents")
     """
 
     def __init__(
@@ -44,9 +58,12 @@ def run(self, documents: List[Document]):
         cleaned_docs = []
         for doc in documents:
             if doc.text is None:
-                raise ValueError(
-                    f"TextDocumentCleaner only works with text documents but document.text for document ID {doc.id} is None."
+                logger.warning(
+                    "TextDocumentCleaner only works with text documents but document.text for document ID %s is None.",
+                    doc.id,
                 )
+                cleaned_docs.append(doc)
+                continue
             text = doc.text
 
             if self.remove_empty_lines:
@@ -70,9 +87,11 @@ def to_dict(self) -> Dict[str, Any]:
         """
         return default_to_dict(
             self,
-            clean_empty_lines=self.remove_empty_lines,
-            clean_whitespaces=self.remove_extra_whitespaces,
-            clean_repeated_substrings=self.remove_repeated_substrings,
+            remove_empty_lines=self.remove_empty_lines,
+            remove_extra_whitespaces=self.remove_extra_whitespaces,
+            remove_repeated_substrings=self.remove_repeated_substrings,
+            remove_substrings=self.remove_substrings,
+            remove_regex=self.remove_regex,
         )
 
     @classmethod
@@ -133,7 +152,7 @@ def _find_and_remove_header_footer(
         :param n_chars: number of first/last characters where the header/footer shall be searched in
         :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
        :param n_last_pages_to_ignore: number of last pages to ignore
-        :return: (cleaned pages, found_header_str, found_footer_str)
+        :return: cleaned text
         """
 
         pages = text.split("\f")
diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py
index 7f012af70e..565ec46dd9 100644
--- a/test/preview/components/preprocessors/test_text_document_cleaner.py
+++ b/test/preview/components/preprocessors/test_text_document_cleaner.py
@@ -1,3 +1,5 @@
+import logging
+
 import pytest
 
 from haystack.preview import Document
@@ -6,12 +8,51 @@
 
 
 class TestTextDocumentCleaner:
     @pytest.mark.unit
-    def test_non_text_document(self):
-        with pytest.raises(
-            ValueError, match="TextDocumentCleaner only works with text documents but document.text for document ID"
-        ):
+    def test_to_dict(self):
+        component = TextDocumentCleaner(
+            remove_empty_lines=False,
+            remove_extra_whitespaces=False,
+            remove_repeated_substrings=True,
+            remove_substrings=["a", "b"],
+            remove_regex=r"\s\s+",
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "TextDocumentCleaner",
+            "init_parameters": {
+                "remove_empty_lines": False,
+                "remove_extra_whitespaces": False,
+                "remove_repeated_substrings": True,
+                "remove_substrings": ["a", "b"],
+                "remove_regex": r"\s\s+",
+            },
+        }
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        data = {
+            "type": "TextDocumentCleaner",
+            "init_parameters": {
+                "remove_empty_lines": False,
+                "remove_extra_whitespaces": False,
+                "remove_repeated_substrings": True,
+                "remove_substrings": ["a", "b"],
+                "remove_regex": r"\s\s+",
+            },
+        }
+        component = TextDocumentCleaner.from_dict(data)
+        assert component.remove_empty_lines == False
+        assert component.remove_extra_whitespaces == False
+        assert component.remove_repeated_substrings == True
+        assert component.remove_substrings == ["a", "b"]
+        assert component.remove_regex == r"\s\s+"
+
+    @pytest.mark.unit
+    def test_non_text_document(self, caplog):
+        with caplog.at_level(logging.WARNING):
             cleaner = TextDocumentCleaner()
             cleaner.run(documents=[Document()])
+        assert "TextDocumentCleaner only works with text documents but document.text for document ID" in caplog.text
 
     @pytest.mark.unit
     def test_single_document(self):

From 215da3d43c3b20859d61315836f8586765d816cf Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Fri, 13 Oct 2023 08:46:16 +0200
Subject: [PATCH 6/9] address second half of review feedback

---
 .../components/preprocessors/__init__.py      |  4 +-
 .../preprocessors/text_document_cleaner.py    | 76 ++++++++++++-------
 ...ext-document-cleaner-8afce831a2ac31ae.yaml |  2 +-
 .../test_text_document_cleaner.py             | 70 +++++++++++------
 4 files changed, 99 insertions(+), 53 deletions(-)

diff --git a/haystack/preview/components/preprocessors/__init__.py b/haystack/preview/components/preprocessors/__init__.py
index a4b48ecd30..3024c7f7e9 100644
--- a/haystack/preview/components/preprocessors/__init__.py
+++ b/haystack/preview/components/preprocessors/__init__.py
@@ -1,4 +1,4 @@
-from haystack.preview.components.preprocessors.text_document_cleaner import TextDocumentCleaner
+from haystack.preview.components.preprocessors.text_document_cleaner import DocumentCleaner
 from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
 
-__all__ = ["TextDocumentSplitter", "TextDocumentCleaner"]
+__all__ = ["TextDocumentSplitter", "DocumentCleaner"]
diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index 89cc3a601e..854e1d6d71 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -11,16 +11,16 @@
 
 @component
-class TextDocumentCleaner:
+class DocumentCleaner:
     """
-    Makes text documents more readable by cleaning empty lines, extra whitespaces, headers and footers, etc.
+    Makes text documents more readable by removing extra whitespaces, empty lines, specified substrings, regexes, headers and footers (in this order).
     This is useful for preparing the documents for further processing by LLMs.
 
     Example usage in an indexing pipeline:
     document_store = MemoryDocumentStore()
     p = Pipeline()
     p.add_component(instance=TextFileToDocument(), name="text_file_converter")
-    p.add_component(instance=TextDocumentCleaner(), name="cleaner")
+    p.add_component(instance=DocumentCleaner(), name="cleaner")
     p.add_component(instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter")
     p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
     p.connect("text_file_converter.documents", "cleaner.documents")
@@ -52,30 +52,32 @@ def __init__(
 
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
+        """
+        Run the DocumentCleaner on the given list of documents
+        """
         if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
-            raise TypeError("TextDocumentCleaner expects a List of Documents as input.")
+            raise TypeError("DocumentCleaner expects a List of Documents as input.")
 
         cleaned_docs = []
         for doc in documents:
             if doc.text is None:
                 logger.warning(
-                    "TextDocumentCleaner only works with text documents but document.text for document ID %s is None.",
-                    doc.id,
+                    "DocumentCleaner only cleans text documents but document.text for document ID %s is None.", doc.id
                 )
                 cleaned_docs.append(doc)
                 continue
             text = doc.text
 
-            if self.remove_empty_lines:
-                text = self._remove_empty_lines(text)
             if self.remove_extra_whitespaces:
                 text = self._remove_extra_whitespaces(text)
-            if self.remove_repeated_substrings:
-                text = self._remove_repeated_substrings(text)
+            if self.remove_empty_lines:
+                text = self._remove_empty_lines(text)
             if self.remove_substrings:
                 text = self._remove_substrings(text, self.remove_substrings)
             if self.remove_regex:
                 text = self._remove_regex(text, self.remove_regex)
+            if self.remove_repeated_substrings:
+                text = self._remove_repeated_substrings(text)
 
             cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
 
@@ -95,7 +97,7 @@ def to_dict(self) -> Dict[str, Any]:
         )
 
     @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "TextDocumentCleaner":
+    def from_dict(cls, data: Dict[str, Any]) -> "DocumentCleaner":
         """
         Deserialize this component from a dictionary.
         """
@@ -105,6 +107,7 @@ def _remove_empty_lines(self, text: str) -> str:
         """
         Remove empty lines and lines that contain nothing but whitespaces from text.
         :param text: Text to clean.
+        :return: The text without empty lines.
         """
         lines = text.split("\n")
         non_empty_lines = filter(lambda line: line.strip() != "", lines)
@@ -114,6 +117,7 @@ def _remove_extra_whitespaces(self, text: str) -> str:
         """
         Remove extra whitespaces from text.
         :param text: Text to clean.
+        :return: The text without extra whitespaces.
""" return re.sub(r"\s\s+", " ", text).strip() @@ -122,6 +126,7 @@ def _remove_regex(self, text: str, regex: str) -> str: Remove substrings that match the specified regex from the text. :param text: Text to clean. :param regex: Regex to match and replace substrings by "". + :param return: The text without any substrings that match the regex. """ return re.sub(regex, "", text).strip() @@ -130,12 +135,18 @@ def _remove_substrings(self, text: str, substrings: List[str]) -> str: Remove all specified substrings from the text. :param text: Text to clean. :param substrings: Substrings to remove. + :return: The text without the specified substrings. """ for substring in substrings: text = text.replace(substring, "") return text def _remove_repeated_substrings(self, text: str) -> str: + """ + Remove any substrings from the text that occur repeatedly. For example headers or footers. + :param text: Text to clean. + :return: The text without the repeated substrings. + """ return self._find_and_remove_header_footer( text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1 ) @@ -149,10 +160,10 @@ def _find_and_remove_header_footer( Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4" or similar. - :param n_chars: number of first/last characters where the header/footer shall be searched in - :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header) - :param n_last_pages_to_ignore: number of last pages to ignore - :return: cleaned text + :param n_chars: The number of first/last characters where the header/footer shall be searched in. + :param n_first_pages_to_ignore: The number of first pages to ignore (e.g. TOCs often don't contain footer/header). + :param n_last_pages_to_ignore: The number of last pages to ignore. + :return: The text without the found headers and footers. """ pages = text.split("\f") @@ -168,16 +179,17 @@ def _find_and_remove_header_footer( found_footer = self._find_longest_common_ngram(end_of_pages) if found_footer: pages = [page.replace(found_footer, "") for page in pages] - # logger.debug("Removed header '%s' and footer '%s' in document", found_header, found_footer) + + logger.debug("Removed header '%s' and footer '%s' in document", found_header, found_footer) text = "\f".join(pages) return text def _ngram(self, seq: str, n: int) -> Generator[str, None, None]: """ - Return ngram (of tokens - currently split by whitespace) - :param seq: str, string from which the ngram shall be created - :param n: int, n of ngram - :return: str, ngram as string + Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace. + :param seq: The sequence to generate ngrams from. + :param n: The length of the ngrams to generate. + :return: A Generator generating all ngrams of length n from the given sequence. """ # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization, @@ -193,20 +205,30 @@ def _ngram(self, seq: str, n: int) -> Generator[str, None, None]: return ngrams def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]: + """ + Generates all possible ngrams from a given sequence of text. + Considering all ngram lengths between the minimum and maximum length. + + :param seq: The sequence to generate ngrams from. + :param min_ngram: The minimum length of ngram to consider. + :param max_ngram: The maximum length of ngram to consider. 
+        :return: A set of all ngrams from the given sequence.
+        """
         lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
         ngrams = map(partial(self._ngram, seq), lengths)
         res = set(chain.from_iterable(ngrams))
         return res
 
-    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> str:
+    def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, max_ngram: int = 30) -> str:
         """
-        Find the longest common ngram across different text sequences (e.g. start of pages).
-        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
+        Find the longest common ngram across a list of text sequences (e.g. start of pages).
+        Considering all ngram lengths between the minimum and maximum length. Helpful for finding footers, headers etc.
+        Empty sequences are ignored.
 
-        :param sequences: list[str], list of strings that shall be searched for common n_grams
-        :param max_ngram: int, maximum length of ngram to consider
-        :param min_ngram: minimum length of ngram to consider
-        :return: str, common string of all sections
+        :param sequences: The list of strings that shall be searched for common n_grams.
+        :param max_ngram: The maximum length of ngram to consider.
+        :param min_ngram: The minimum length of ngram to consider.
+        :return: The longest ngram that all sequences have in common.
         """
         sequences = [s for s in sequences if s]  # filter empty sequences
         if not sequences:
diff --git a/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml b/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml
index f15d8d82f5..cde155a938 100644
--- a/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml
+++ b/releasenotes/notes/text-document-cleaner-8afce831a2ac31ae.yaml
@@ -1,5 +1,5 @@
 ---
 preview:
   - |
-    Add TextDocumentCleaner, which removes extra whitespace, empty lines, headers, etc. from Documents containing text.
+    Added DocumentCleaner, which removes extra whitespace, empty lines, headers, etc. from Documents containing text.
     Useful as a preprocessing step before splitting into shorter text documents.
diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py
index 565ec46dd9..d8106972fd 100644
--- a/test/preview/components/preprocessors/test_text_document_cleaner.py
+++ b/test/preview/components/preprocessors/test_text_document_cleaner.py
@@ -3,22 +3,46 @@
 import pytest
 
 from haystack.preview import Document
-from haystack.preview.components.preprocessors import TextDocumentCleaner
+from haystack.preview.components.preprocessors import DocumentCleaner
 
 
-class TestTextDocumentCleaner:
+class TestDocumentCleaner:
+    @pytest.mark.unit
+    def test_init(self):
+        cleaner = DocumentCleaner()
+        assert cleaner.remove_empty_lines == True
+        assert cleaner.remove_extra_whitespaces == True
+        assert cleaner.remove_repeated_substrings == False
+        assert cleaner.remove_substrings is None
+        assert cleaner.remove_regex is None
+
     @pytest.mark.unit
     def test_to_dict(self):
-        component = TextDocumentCleaner(
+        cleaner = DocumentCleaner()
+        data = cleaner.to_dict()
+        assert data == {
+            "type": "DocumentCleaner",
+            "init_parameters": {
+                "remove_empty_lines": True,
+                "remove_extra_whitespaces": True,
+                "remove_repeated_substrings": False,
+                "remove_substrings": None,
+                "remove_regex": None,
+            },
+        }
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
+        cleaner = DocumentCleaner(
             remove_empty_lines=False,
             remove_extra_whitespaces=False,
             remove_repeated_substrings=True,
             remove_substrings=["a", "b"],
             remove_regex=r"\s\s+",
         )
-        data = component.to_dict()
+        data = cleaner.to_dict()
         assert data == {
-            "type": "TextDocumentCleaner",
+            "type": "DocumentCleaner",
             "init_parameters": {
                 "remove_empty_lines": False,
                 "remove_extra_whitespaces": False,
@@ -31,7 +55,7 @@ def test_from_dict(self):
     @pytest.mark.unit
     def test_from_dict(self):
         data = {
-            "type": "TextDocumentCleaner",
+            "type": "DocumentCleaner",
             "init_parameters": {
                 "remove_empty_lines": False,
                 "remove_extra_whitespaces": False,
                 "remove_repeated_substrings": True,
                 "remove_substrings": ["a", "b"],
                 "remove_regex": r"\s\s+",
             },
         }
-        component = TextDocumentCleaner.from_dict(data)
-        assert component.remove_empty_lines == False
-        assert component.remove_extra_whitespaces == False
-        assert component.remove_repeated_substrings == True
-        assert component.remove_substrings == ["a", "b"]
-        assert component.remove_regex == r"\s\s+"
+        cleaner = DocumentCleaner.from_dict(data)
+        assert cleaner.remove_empty_lines == False
+        assert cleaner.remove_extra_whitespaces == False
+        assert cleaner.remove_repeated_substrings == True
+        assert cleaner.remove_substrings == ["a", "b"]
+        assert cleaner.remove_regex == r"\s\s+"
 
     @pytest.mark.unit
     def test_non_text_document(self, caplog):
         with caplog.at_level(logging.WARNING):
-            cleaner = TextDocumentCleaner()
+            cleaner = DocumentCleaner()
             cleaner.run(documents=[Document()])
-        assert "TextDocumentCleaner only works with text documents but document.text for document ID" in caplog.text
+        assert "DocumentCleaner only cleans text documents but document.text for document ID" in caplog.text
 
     @pytest.mark.unit
     def test_single_document(self):
-        with pytest.raises(TypeError, match="TextDocumentCleaner expects a List of Documents as input."):
-            cleaner = TextDocumentCleaner()
+        with pytest.raises(TypeError, match="DocumentCleaner expects a List of Documents as input."):
+            cleaner = DocumentCleaner()
             cleaner.run(documents=Document())
 
     @pytest.mark.unit
     def test_empty_list(self):
-        cleaner = TextDocumentCleaner()
+        cleaner = DocumentCleaner()
         result = cleaner.run(documents=[])
         assert result == {"documents": []}
 
     @pytest.mark.unit
     def test_remove_empty_lines(self):
-        cleaner = TextDocumentCleaner(remove_extra_whitespaces=False)
+        cleaner = DocumentCleaner(remove_extra_whitespaces=False)
         result = cleaner.run(
             documents=[
                 Document(
@@ -88,7 +112,7 @@ def test_remove_empty_lines(self):
     @pytest.mark.unit
     def test_remove_whitespaces(self):
-        cleaner = TextDocumentCleaner(remove_empty_lines=False)
+        cleaner = DocumentCleaner(remove_empty_lines=False)
         result = cleaner.run(
             documents=[
                 Document(
@@ -107,21 +131,21 @@ def test_remove_whitespaces(self):
     def test_remove_substrings(self):
-        cleaner = TextDocumentCleaner(remove_substrings=["This", "A", "words"])
-        result = cleaner.run(documents=[Document(text="This is a text with some words.")])
+        cleaner = DocumentCleaner(remove_substrings=["This", "A", "words", "🪲"])
+        result = cleaner.run(documents=[Document(text="This is a text with some words.🪲")])
         assert len(result["documents"]) == 1
         assert result["documents"][0].text == " is a text with some ."
 
     @pytest.mark.unit
     def test_remove_regex(self):
-        cleaner = TextDocumentCleaner(remove_regex=r"\s\s+")
+        cleaner = DocumentCleaner(remove_regex=r"\s\s+")
         result = cleaner.run(documents=[Document(text="This is a text with some words.")])
         assert len(result["documents"]) == 1
         assert result["documents"][0].text == "This is a text with some words."
 
     @pytest.mark.unit
     def test_remove_repeated_substrings(self):
-        cleaner = TextDocumentCleaner(
+        cleaner = DocumentCleaner(
             remove_empty_lines=False, remove_extra_whitespaces=False, remove_repeated_substrings=True
         )

From 97bd916378a97c52a9ab68b82f3391953072f991 Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Fri, 13 Oct 2023 10:39:34 +0200
Subject: [PATCH 7/9] mention \f page separator for header/footer removal

---
 .../components/preprocessors/text_document_cleaner.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index 854e1d6d71..583dc83f42 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -13,7 +13,7 @@
 @component
 class DocumentCleaner:
     """
-    Makes text documents more readable by removing extra whitespaces, empty lines, specified substrings, regexes, headers and footers (in this order).
+    Makes text documents more readable by removing extra whitespaces, empty lines, specified substrings, regexes, page headers and footers (in this order).
     This is useful for preparing the documents for further processing by LLMs.
 
     Example usage in an indexing pipeline:
@@ -39,7 +39,7 @@ def __init__(
         """
         :param remove_empty_lines: Whether to remove empty lines.
         :param remove_extra_whitespaces: Whether to remove extra whitespaces.
-        :param remove_repeated_substrings: Whether to remove repeated substrings, such as headers and footers.
+        :param remove_repeated_substrings: Whether to remove repeated substrings from pages, such as headers and footers. Pages in the text need to be separated by form feed character "\f", which is supported by TextFileToDocument and AzureOCRDocumentConverter.
         :param remove_substrings: List of substrings to remove from the text.
         :param remove_regex: Regex to match and replace substrings by "".
""" @@ -143,7 +143,8 @@ def _remove_substrings(self, text: str, substrings: List[str]) -> str: def _remove_repeated_substrings(self, text: str) -> str: """ - Remove any substrings from the text that occur repeatedly. For example headers or footers. + Remove any substrings from the text that occur repeatedly on every page. For example headers or footers. + Pages in the text need to be separated by form feed character "\f". :param text: Text to clean. :return: The text without the repeated substrings. """ @@ -156,6 +157,7 @@ def _find_and_remove_header_footer( ) -> str: """ Heuristic to find footers and headers across different pages by searching for the longest common string. + Pages in the text need to be separated by form feed character "\f". For headers, we only search in the first n_chars characters (for footer: last n_chars). Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4" or similar. From c3cdd864b0bd50ada5f1a3515cc76fe2a97daf9f Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 13 Oct 2023 10:41:40 +0200 Subject: [PATCH 8/9] mention \f page separator for header/footer removal --- .../preview/components/preprocessors/text_document_cleaner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py index 583dc83f42..ea39f07e36 100644 --- a/haystack/preview/components/preprocessors/text_document_cleaner.py +++ b/haystack/preview/components/preprocessors/text_document_cleaner.py @@ -39,7 +39,9 @@ def __init__( """ :param remove_empty_lines: Whether to remove empty lines. :param remove_extra_whitespaces: Whether to remove extra whitespaces. - :param remove_repeated_substrings: Whether to remove repeated substrings from pages, such as headers and footers. Pages in the text need to be separated by form feed character "\f", which is supported by TextFileToDocument and AzureOCRDocumentConverter. + :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages. + Pages in the text need to be separated by form feed character "\f", + which is supported by TextFileToDocument and AzureOCRDocumentConverter. :param remove_substrings: List of substrings to remove from the text. :param remove_regex: Regex to match and replace substrings by "". """ From 365b323ed14894710536a66335c4bfd9d3b3e56a Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 13 Oct 2023 12:06:44 +0200 Subject: [PATCH 9/9] mark example usage as python code --- .../preview/components/preprocessors/text_document_cleaner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py index ea39f07e36..0c0693d277 100644 --- a/haystack/preview/components/preprocessors/text_document_cleaner.py +++ b/haystack/preview/components/preprocessors/text_document_cleaner.py @@ -17,6 +17,8 @@ class DocumentCleaner: This is useful for preparing the documents for further processing by LLMs. 
     Example usage in an indexing pipeline:
+
+    ```python
     document_store = MemoryDocumentStore()
     p = Pipeline()
     p.add_component(instance=TextFileToDocument(), name="text_file_converter")
     p.add_component(instance=DocumentCleaner(), name="cleaner")
     p.add_component(instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter")
     p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
     p.connect("text_file_converter.documents", "cleaner.documents")
     p.connect("cleaner.documents", "splitter.documents")
     p.connect("splitter.documents", "writer.documents")
+    ```
     """
 
     def __init__(
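
A short, self-contained sketch of the component this series builds, useful for trying out the final state of the branch locally. It assumes only what the diffs above show: the `haystack.preview` package layout, a `Document` with a `text` field, and the `DocumentCleaner` name used from PATCH 6 onward. The input strings and the citation-marker regex are made up for illustration.

```python
from haystack.preview import Document
from haystack.preview.components.preprocessors import DocumentCleaner

# Default settings: whitespace runs are collapsed first, then empty lines are
# dropped (the order run() applies after PATCH 5 reordered the cleaning steps).
cleaner = DocumentCleaner()
docs = [Document(text=" This  is   a  text. \n\nSecond  sentence. ")]
print(cleaner.run(documents=docs)["documents"][0].text)
# -> "This is a text. Second sentence."

# Regex-only cleaning, i.e. re.sub(remove_regex, "", text).strip() per document.
citation_cleaner = DocumentCleaner(
    remove_empty_lines=False,
    remove_extra_whitespaces=False,
    remove_regex=r"\[\d+\]",  # hypothetical pattern: strips "[1]"-style markers
)
docs = [Document(text="See note[1] and note[2].")]
print(citation_cleaner.run(documents=docs)["documents"][0].text)
# -> "See note and note."
```

Header and footer removal (`remove_repeated_substrings=True`) works the same way but expects multi-page text with pages separated by "\f", as exercised in `test_remove_repeated_substrings` above.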