From a49fc93fab847f86197ed32036bd462a30d18591 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 20 Nov 2024 15:09:53 +0100 Subject: [PATCH 01/82] initial import --- .../preprocessors/recursive_chunker.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 haystack/components/preprocessors/recursive_chunker.py diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py new file mode 100644 index 0000000000..d1f5146c5c --- /dev/null +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -0,0 +1,23 @@ +from haystack import component + + +@component +class RecursiveChunker: + def __init__(self): + pass + + def _chunk_text(self, text): + # some logic to split text into smaller chunks + return text + + def run(self, documents): + """ + Split text of documents into smaller chunks recursively. + + :param documents: + :returns: + Documents with text split into smaller chunks + """ + for doc in documents: + doc.text = self._chunk_text(doc.text) + return documents From 41f5f64c58fe54f739f17ed98bc5396c2310e31b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 20 Nov 2024 16:56:11 +0100 Subject: [PATCH 02/82] initial import --- .../preprocessors/recursive_chunker.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index d1f5146c5c..ee7af9127d 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -1,12 +1,26 @@ +from typing import List + from haystack import component @component class RecursiveChunker: - def __init__(self): - pass + def __init__( + self, + chunk_size: int, + chunk_overlap: int, + separators: List[str], + keep_separator: bool = True, + is_separator_regex: bool = False, + ): + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.separators = separators + self.keep_separator = keep_separator + self.is_separator_regex = is_separator_regex - def _chunk_text(self, text): + @staticmethod + def _chunk_text(text): # some logic to split text into smaller chunks return text From 79c669ecd4604847bda36a777d08a58db2c5efff Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 20 Nov 2024 17:33:20 +0100 Subject: [PATCH 03/82] wip --- .../components/preprocessors/recursive_chunker.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index ee7af9127d..d321b306e9 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -21,7 +21,19 @@ def __init__( @staticmethod def _chunk_text(text): - # some logic to split text into smaller chunks + # 1. identify all occurrences of the first splitting character in the text + + # 2. split the text at the first occurrence of the splitting character + + # 3. assessing each split to check whether they meet the condition of being smaller than our specified chunk + # size + + # 4. splits that satisfy this condition can be labeled as good splits. + # 4.1 combine good splits if each individual split is smaller than the chunk size + + # 5. 
splits that don't satisfy the condition of being smaller than the chunk size can be labeled as bad splits + # 5.1 split the bad splits recursively until they meet the condition of being smaller than the chunk size + return text def run(self, documents): From 87b8023a7e0c70b4ca7417f70465cea6170c8d31 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 25 Nov 2024 17:40:50 +0100 Subject: [PATCH 04/82] adding initial version + tests --- .../preprocessors/recursive_chunker.py | 97 +++++++++++++++---- .../preprocessors/test_recursive_chunker.py | 33 +++++++ 2 files changed, 111 insertions(+), 19 deletions(-) create mode 100644 test/components/preprocessors/test_recursive_chunker.py diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index d321b306e9..9246a87453 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -1,6 +1,7 @@ +import re from typing import List -from haystack import component +from haystack import Document, component @component @@ -19,31 +20,89 @@ def __init__( self.keep_separator = keep_separator self.is_separator_regex = is_separator_regex - @staticmethod - def _chunk_text(text): - # 1. identify all occurrences of the first splitting character in the text + def _apply_overlap(self, chunks: List[str]) -> List[str]: + if self.chunk_overlap <= 0: + return chunks - # 2. split the text at the first occurrence of the splitting character + overlapped_chunks = [] + for i in range(len(chunks)): + if i > 0: + # Add overlap from previous chunk + overlap_start = max(0, len(chunks[i - 1]) - self.chunk_overlap) + current_chunk = chunks[i - 1][overlap_start:] + chunks[i] + overlapped_chunks.append(current_chunk) + else: + overlapped_chunks.append(chunks[i]) + return overlapped_chunks - # 3. assessing each split to check whether they meet the condition of being smaller than our specified chunk - # size + def _chunk_text(self, text: str) -> List[str]: + if not text: + return [] - # 4. splits that satisfy this condition can be labeled as good splits. - # 4.1 combine good splits if each individual split is smaller than the chunk size + if len(text) <= self.chunk_size: + return [text] - # 5. 
splits that don't satisfy the condition of being smaller than the chunk size can be labeled as bad splits - # 5.1 split the bad splits recursively until they meet the condition of being smaller than the chunk size + # Try each separator in order + for separator in self.separators: + # split using the current separator + splits = text.split(separator) if not self.is_separator_regex else re.split(separator, text) - return text + # filter out empty splits + splits = [s for s in splits if s.strip()] - def run(self, documents): + if len(splits) == 1: # go to next separator, if current separator not found + continue + + chunks = [] + current_chunk = [] + current_length = 0 + + # check splits, if any is too long, recursively chunk it, otherwise add to current chunk + for split in splits: + split_text = split + if self.keep_separator: + split_text = separator + split if split != splits[0] else split + + # if adding this split exceeds chunk_size, process current_chunk + if current_length + len(split_text) > self.chunk_size: + if current_chunk: # Save the good splits + chunks.append("".join(current_chunk)) + current_chunk = [] + current_length = 0 + + # recursively handle splits that are too large + if len(split_text) > self.chunk_size: + chunks.extend(self._chunk_text(split_text)) + else: + chunks.append(split_text) + else: + current_chunk.append(split_text) + current_length += len(split_text) + + if current_chunk: + chunks.append("".join(current_chunk)) + + chunks = self._apply_overlap(chunks) + + return chunks + + # If no separator worked, fall back to character-level chunking + return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)] + + def _run_one(self, doc: Document) -> List[Document]: + new_docs = [] + chunks = self._chunk_text(doc.content) + for chunk in chunks: + new_doc = Document(content=chunk, meta=doc.meta) + new_doc.meta["original_id"] = doc.id + new_docs.append(new_doc) + return new_docs + + def run(self, documents: List[Document]) -> List[Document]: """ Split text of documents into smaller chunks recursively. - - :param documents: - :returns: - Documents with text split into smaller chunks """ + new_docs = [] for doc in documents: - doc.text = self._chunk_text(doc.text) - return documents + new_docs.extend(self._run_one(doc)) + return new_docs diff --git a/test/components/preprocessors/test_recursive_chunker.py b/test/components/preprocessors/test_recursive_chunker.py new file mode 100644 index 0000000000..3475f3809c --- /dev/null +++ b/test/components/preprocessors/test_recursive_chunker.py @@ -0,0 +1,33 @@ +import pytest +from haystack.components.preprocessors.recursive_chunker import RecursiveChunker +from haystack.dataclasses import Document + + +@pytest.mark.parametrize("keep_separator", [True, False]) +def test_chunk_text_with_simple_separator(chunk_size, chunk_overlap, separators, keep_separator): + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."], keep_separator=keep_separator) + + text = "This is a test. Another sentence. And one more." + chunks = chunker._chunk_text(text) + + assert len(chunks) == 3 + assert chunks[0] == "This is a test." + assert chunks[1] == ". Another sentence." + assert chunks[2] == ". And one more." 
+ + +def test_chunk_text_with_multiple_separators_recursive(): + # try: paragraph, newline, sentence, space + + chunker = RecursiveChunker(chunk_size=50, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) + + # This text has paragraph breaks, newlines, sentences, and spaces + text = """Artificial intelligence (AI) - Introduction + +AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. +It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. Such machines may be called AIs. + +AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).[2] However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being called AI because once something becomes useful enough and common enough it's not labeled AI anymore." +""" + + chunks = chunker._chunk_text(text) From 09b25f3490ab4cd391e45808bd46a12a854152e1 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 2 Dec 2024 17:33:39 +0100 Subject: [PATCH 05/82] adding more tests --- .../preprocessors/document_cleaner.py | 2 +- .../preprocessors/document_splitter.py | 2 +- .../preprocessors/recursive_chunker.py | 73 ++++++++++++++----- .../preprocessors/test_recursive_chunker.py | 58 ++++++++++++--- 4 files changed, 106 insertions(+), 29 deletions(-) diff --git a/haystack/components/preprocessors/document_cleaner.py b/haystack/components/preprocessors/document_cleaner.py index d56006130c..151932506d 100644 --- a/haystack/components/preprocessors/document_cleaner.py +++ b/haystack/components/preprocessors/document_cleaner.py @@ -38,7 +38,7 @@ class DocumentCleaner: ``` """ - def __init__( + def __init__( # pylint: disable=too-many-positional-arguments self, remove_empty_lines: bool = True, remove_extra_whitespaces: bool = True, diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 86d95f412a..f81df10974 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) # Maps the 'split_by' argument to the actual char used to split the Documents. -# 'function' is not in the mapping cause it doesn't split on chars. +# 'function' is not in the mapping because it doesn't split on chars. 
_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " ", "line": "\n"} diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index 9246a87453..e7921bd37a 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -1,12 +1,14 @@ import re from typing import List -from haystack import Document, component +from haystack import Document, component, logging + +logger = logging.getLogger(__name__) @component class RecursiveChunker: - def __init__( + def __init__( # pylint: disable=too-many-positional-arguments self, chunk_size: int, chunk_overlap: int, @@ -19,33 +21,66 @@ def __init__( self.separators = separators self.keep_separator = keep_separator self.is_separator_regex = is_separator_regex + if "sentence" in separators: + self._check_if_nltk_is_installed() + + @staticmethod + def _check_if_nltk_is_installed(): + try: + import nltk + + nltk.data.find("tokenizers/punkt") + except (LookupError, ModuleNotFoundError): + raise Exception("You need to install NLTK to use this function. You can install it via `pip install nltk`") def _apply_overlap(self, chunks: List[str]) -> List[str]: + """ + Applies an overlap between consecutive chunks if the chunk_overlap attribute is greater than zero. + + :param chunks: + :returns: + The list of chunks with overlap applied. + """ if self.chunk_overlap <= 0: return chunks overlapped_chunks = [] - for i in range(len(chunks)): - if i > 0: - # Add overlap from previous chunk - overlap_start = max(0, len(chunks[i - 1]) - self.chunk_overlap) - current_chunk = chunks[i - 1][overlap_start:] + chunks[i] + for idx, chunk in enumerate(chunks): + if idx > 0: + # adds an overlap from previous chunk + overlap_start = max(0, len(chunks[idx - 1]) - self.chunk_overlap) + current_chunk = chunks[idx - 1][overlap_start:] + chunk overlapped_chunks.append(current_chunk) else: - overlapped_chunks.append(chunks[i]) + overlapped_chunks.append(chunk) return overlapped_chunks def _chunk_text(self, text: str) -> List[str]: - if not text: - return [] + """ + Recursive chunking algorithm that divides text into smaller chunks based on a list of separator characters. + + It starts with a list of separator characters (e.g., ["\n\n", "\n", " ", ""]) and attempts to divide the text + using the first separator. If the resulting chunks are still larger than the specified chunk size, it moves to + the next separator in the list. + This process continues recursively, using progressively less specific separators until the chunks meet the + desired size criteria. + :param text: + :returns: + A list of text chunks. 
+ """ if len(text) <= self.chunk_size: return [text] - # Try each separator in order + # try each separator for separator in self.separators: - # split using the current separator - splits = text.split(separator) if not self.is_separator_regex else re.split(separator, text) + if separator in "sentence": + from nltk.tokenize import sent_tokenize + + splits = sent_tokenize(text) + else: + # split using the current separator + splits = text.split(separator) if not self.is_separator_regex else re.split(separator, text) # filter out empty splits splits = [s for s in splits if s.strip()] @@ -54,14 +89,14 @@ def _chunk_text(self, text: str) -> List[str]: continue chunks = [] - current_chunk = [] + current_chunk: List[str] = [] current_length = 0 # check splits, if any is too long, recursively chunk it, otherwise add to current chunk for split in splits: split_text = split - if self.keep_separator: - split_text = separator + split if split != splits[0] else split + if self.keep_separator and separator != "sentence": + split_text = split + separator # if adding this split exceeds chunk_size, process current_chunk if current_length + len(split_text) > self.chunk_size: @@ -91,7 +126,8 @@ def _chunk_text(self, text: str) -> List[str]: def _run_one(self, doc: Document) -> List[Document]: new_docs = [] - chunks = self._chunk_text(doc.content) + # NOTE: the check for a non-empty content is already done in the run method + chunks = self._chunk_text(doc.content) # type: ignore for chunk in chunks: new_doc = Document(content=chunk, meta=doc.meta) new_doc.meta["original_id"] = doc.id @@ -104,5 +140,8 @@ def run(self, documents: List[Document]) -> List[Document]: """ new_docs = [] for doc in documents: + if not doc.content or doc.content == "": + logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) + continue new_docs.extend(self._run_one(doc)) return new_docs diff --git a/test/components/preprocessors/test_recursive_chunker.py b/test/components/preprocessors/test_recursive_chunker.py index 3475f3809c..4714f4b67b 100644 --- a/test/components/preprocessors/test_recursive_chunker.py +++ b/test/components/preprocessors/test_recursive_chunker.py @@ -4,30 +4,68 @@ @pytest.mark.parametrize("keep_separator", [True, False]) -def test_chunk_text_with_simple_separator(chunk_size, chunk_overlap, separators, keep_separator): +def test_chunk_text_with_simple_separator(keep_separator): chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."], keep_separator=keep_separator) text = "This is a test. Another sentence. And one more." chunks = chunker._chunk_text(text) - assert len(chunks) == 3 - assert chunks[0] == "This is a test." - assert chunks[1] == ". Another sentence." - assert chunks[2] == ". And one more." + if keep_separator: + assert len(chunks) == 3 + assert chunks[0] == "This is a test." + assert chunks[1] == " Another sentence." + assert chunks[2] == " And one more." 
+ else: + assert len(chunks) == 3 + assert chunks[0] == "This is a test" + assert chunks[1] == " Another sentence" + assert chunks[2] == " And one more" def test_chunk_text_with_multiple_separators_recursive(): # try: paragraph, newline, sentence, space - chunker = RecursiveChunker(chunk_size=50, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) + chunker = RecursiveChunker( + chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True + ) - # This text has paragraph breaks, newlines, sentences, and spaces text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. Such machines may be called AIs. +AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; g1enerative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 + chunks = chunker._chunk_text(text) + assert len(chunks) == 4 + assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" + assert ( + chunks[1] + == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. \n" + ) # noqa: E501 + assert chunks[2] == "AI technology is widely used throughout industry, government, and science." + assert ( + chunks[3] + == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games." + ) # noqa: E501 + + +def test_chunk_text_using_nltk_sentence(): + chunker = RecursiveChunker( + chunk_size=400, chunk_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True + ) -AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).[2] However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being called AI because once something becomes useful enough and common enough it's not labeled AI anymore." -""" + text = """Artificial intelligence (AI) - Introduction + +AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. +AI technology is widely used throughout industry, government, and science. 
Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 chunks = chunker._chunk_text(text) + assert len(chunks) == 4 + assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" + assert ( + chunks[1] + == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. \n" + ) # noqa: E501 + assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501 + assert ( + chunks[3] + == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." # noqa: E501 + ) From a39f4815ed2d2e68328981d8afc7f90141ee6b8d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 2 Dec 2024 18:35:53 +0100 Subject: [PATCH 06/82] more tests --- .../preprocessors/recursive_chunker.py | 35 ++++++----- .../preprocessors/test_recursive_chunker.py | 62 ++++++++++++++++--- 2 files changed, 72 insertions(+), 25 deletions(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index e7921bd37a..d1f932907b 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -21,8 +21,18 @@ def __init__( # pylint: disable=too-many-positional-arguments self.separators = separators self.keep_separator = keep_separator self.is_separator_regex = is_separator_regex + self._check_params() if "sentence" in separators: self._check_if_nltk_is_installed() + from nltk.tokenize import sent_tokenize + + self.nltk_tokenizer = sent_tokenize + + def _check_params(self): + if self.chunk_overlap < 0: + raise ValueError("Overlap must be greater than zero.") + if self.chunk_overlap >= self.chunk_size: + raise ValueError("Overlap cannot be greater than or equal to the chunk size.") @staticmethod def _check_if_nltk_is_installed(): @@ -37,22 +47,18 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: """ Applies an overlap between consecutive chunks if the chunk_overlap attribute is greater than zero. - :param chunks: + :param chunks: List of text chunks. :returns: The list of chunks with overlap applied. 
""" - if self.chunk_overlap <= 0: - return chunks - overlapped_chunks = [] for idx, chunk in enumerate(chunks): - if idx > 0: - # adds an overlap from previous chunk - overlap_start = max(0, len(chunks[idx - 1]) - self.chunk_overlap) - current_chunk = chunks[idx - 1][overlap_start:] + chunk - overlapped_chunks.append(current_chunk) - else: + if idx == 0: overlapped_chunks.append(chunk) + continue + overlap_start = max(0, len(chunks[idx - 1]) - self.chunk_overlap) + current_chunk = chunks[idx - 1][overlap_start:] + chunk + overlapped_chunks.append(current_chunk) return overlapped_chunks def _chunk_text(self, text: str) -> List[str]: @@ -75,9 +81,7 @@ def _chunk_text(self, text: str) -> List[str]: # try each separator for separator in self.separators: if separator in "sentence": - from nltk.tokenize import sent_tokenize - - splits = sent_tokenize(text) + splits = self.nltk_tokenizer(text) else: # split using the current separator splits = text.split(separator) if not self.is_separator_regex else re.split(separator, text) @@ -100,7 +104,7 @@ def _chunk_text(self, text: str) -> List[str]: # if adding this split exceeds chunk_size, process current_chunk if current_length + len(split_text) > self.chunk_size: - if current_chunk: # Save the good splits + if current_chunk: # keep the good splits chunks.append("".join(current_chunk)) current_chunk = [] current_length = 0 @@ -117,7 +121,8 @@ def _chunk_text(self, text: str) -> List[str]: if current_chunk: chunks.append("".join(current_chunk)) - chunks = self._apply_overlap(chunks) + if self.chunk_overlap > 0: + chunks = self._apply_overlap(chunks) return chunks diff --git a/test/components/preprocessors/test_recursive_chunker.py b/test/components/preprocessors/test_recursive_chunker.py index 4714f4b67b..c272ba95e0 100644 --- a/test/components/preprocessors/test_recursive_chunker.py +++ b/test/components/preprocessors/test_recursive_chunker.py @@ -1,6 +1,47 @@ import pytest from haystack.components.preprocessors.recursive_chunker import RecursiveChunker -from haystack.dataclasses import Document + + +def test_init_with_negative_overlap(): + with pytest.raises(ValueError): + _ = RecursiveChunker(chunk_size=20, chunk_overlap=-1, separators=["."]) + + +def test_init_with_overlap_greater_than_chunk_size(): + with pytest.raises(ValueError): + _ = RecursiveChunker(chunk_size=10, chunk_overlap=15, separators=["."]) + + +def test_apply_overlap_no_overlap(): + # Test the case where there is no overlap between chunks + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) + chunks = ["chunk1", "chunk2", "chunk3"] + result = chunker._apply_overlap(chunks) + assert result == ["chunk1", "chunk2", "chunk3"] + + +def test_apply_overlap_with_overlap(): + # Test the case where there is overlap between chunks + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=4, separators=["."]) + chunks = ["chunk1", "chunk2", "chunk3"] + result = chunker._apply_overlap(chunks) + assert result == ["chunk1", "unk1chunk2", "unk2chunk3"] + + +def test_apply_overlap_single_chunk(): + # Test the case where there is only one chunk + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=3, separators=["."]) + chunks = ["chunk1"] + result = chunker._apply_overlap(chunks) + assert result == ["chunk1"] + + +def test_chunk_text_smaller_than_chunk_size(): + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) + text = "small text" + chunks = chunker._chunk_text(text) + assert len(chunks) == 1 + assert chunks[0] == text 
@pytest.mark.parametrize("keep_separator", [True, False]) @@ -23,22 +64,19 @@ def test_chunk_text_with_simple_separator(keep_separator): def test_chunk_text_with_multiple_separators_recursive(): - # try: paragraph, newline, sentence, space - chunker = RecursiveChunker( chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True ) - text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; g1enerative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 +AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 chunks = chunker._chunk_text(text) assert len(chunks) == 4 assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" assert ( chunks[1] - == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. \n" + == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" ) # noqa: E501 assert chunks[2] == "AI technology is widely used throughout industry, government, and science." assert ( @@ -48,10 +86,14 @@ def test_chunk_text_with_multiple_separators_recursive(): def test_chunk_text_using_nltk_sentence(): + """ + This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and + requires a more sophisticated sentence tokenizer like the one provided by NLTK. + """ + chunker = RecursiveChunker( chunk_size=400, chunk_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True ) - text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -62,10 +104,10 @@ def test_chunk_text_using_nltk_sentence(): assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" assert ( chunks[1] - == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. \n" + == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" ) # noqa: E501 assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501 assert ( chunks[3] - == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." 
# noqa: E501 - ) + == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." + ) # noqa: E501 From db82194e89d5409deb93f696bf4c41b812e6a8e0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 4 Dec 2024 11:32:10 +0100 Subject: [PATCH 07/82] incorporating SentenceSplitter based on NLTK --- .../components/preprocessors/recursive_chunker.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index d1f932907b..fc159be081 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -23,10 +23,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self.is_separator_regex = is_separator_regex self._check_params() if "sentence" in separators: - self._check_if_nltk_is_installed() - from nltk.tokenize import sent_tokenize - - self.nltk_tokenizer = sent_tokenize + self.nltk_tokenizer = self._get_custom_sentence_tokenizer() def _check_params(self): if self.chunk_overlap < 0: @@ -35,13 +32,12 @@ def _check_params(self): raise ValueError("Overlap cannot be greater than or equal to the chunk size.") @staticmethod - def _check_if_nltk_is_installed(): + def _get_custom_sentence_tokenizer(): try: - import nltk - - nltk.data.find("tokenizers/punkt") + from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports except (LookupError, ModuleNotFoundError): raise Exception("You need to install NLTK to use this function. You can install it via `pip install nltk`") + return SentenceSplitter(language="en") def _apply_overlap(self, chunks: List[str]) -> List[str]: """ @@ -81,7 +77,8 @@ def _chunk_text(self, text: str) -> List[str]: # try each separator for separator in self.separators: if separator in "sentence": - splits = self.nltk_tokenizer(text) + sentence_with_spans = self.nltk_tokenizer.split_sentences(text) + splits = [sentence["sentence"] for sentence in sentence_with_spans] else: # split using the current separator splits = text.split(separator) if not self.is_separator_regex else re.split(separator, text) From cbfcc66dc67e6ffccb8db18d989225d9f4c59c99 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 4 Dec 2024 16:01:31 +0100 Subject: [PATCH 08/82] adding more tests --- .../preprocessors/recursive_chunker.py | 35 +++-- .../preprocessors/test_recursive_chunker.py | 126 +++++++++++++++--- 2 files changed, 131 insertions(+), 30 deletions(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index fc159be081..06f0e881d2 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -1,7 +1,7 @@ import re -from typing import List +from typing import Any, Dict, List -from haystack import Document, component, logging +from haystack import Document, component, default_from_dict, default_to_dict, logging logger = logging.getLogger(__name__) @@ -63,9 +63,8 @@ def _chunk_text(self, text: str) -> List[str]: It starts with a list of separator characters (e.g., ["\n\n", "\n", " ", ""]) and attempts to divide the text using the first separator. If the resulting chunks are still larger than the specified chunk size, it moves to - the next separator in the list. - This process continues recursively, using progressively less specific separators until the chunks meet the - desired size criteria. + the next separator in the list. This process continues recursively, progressively applying each specific + separator until the chunks meet the desired size criteria. :param text: :returns: @@ -76,7 +75,7 @@ def _chunk_text(self, text: str) -> List[str]: # try each separator for separator in self.separators: - if separator in "sentence": + if separator in "sentence": # using nltk sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] else: @@ -123,12 +122,32 @@ def _chunk_text(self, text: str) -> List[str]: return chunks - # If no separator worked, fall back to character-level chunking + # if no separator worked, fall back to character-level chunking return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)] + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the RecursiveChunker instance to a dictionary. + """ + return default_to_dict( + self, + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + separators=self.separators, + keep_separator=self.keep_separator, + is_separator_regex=self.is_separator_regex, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "RecursiveChunker": + """ + Deserializes a dictionary to a RecursiveChunker instance. 
+ """ + return default_from_dict(cls, data) + def _run_one(self, doc: Document) -> List[Document]: new_docs = [] - # NOTE: the check for a non-empty content is already done in the run method + # NOTE: the check for a non-empty content is already done in the run method, hence the type ignore below chunks = self._chunk_text(doc.content) # type: ignore for chunk in chunks: new_doc = Document(content=chunk, meta=doc.meta) diff --git a/test/components/preprocessors/test_recursive_chunker.py b/test/components/preprocessors/test_recursive_chunker.py index c272ba95e0..9ad3b45a58 100644 --- a/test/components/preprocessors/test_recursive_chunker.py +++ b/test/components/preprocessors/test_recursive_chunker.py @@ -1,4 +1,6 @@ import pytest + +from haystack import Document, Pipeline from haystack.components.preprocessors.recursive_chunker import RecursiveChunker @@ -47,7 +49,6 @@ def test_chunk_text_smaller_than_chunk_size(): @pytest.mark.parametrize("keep_separator", [True, False]) def test_chunk_text_with_simple_separator(keep_separator): chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."], keep_separator=keep_separator) - text = "This is a test. Another sentence. And one more." chunks = chunker._chunk_text(text) @@ -63,14 +64,20 @@ def test_chunk_text_with_simple_separator(keep_separator): assert chunks[2] == " And one more" -def test_chunk_text_with_multiple_separators_recursive(): +def test_chunk_text_using_nltk_sentence(): + """ + This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and + requires a more sophisticated sentence tokenizer like the one provided by NLTK. + """ + chunker = RecursiveChunker( - chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True + chunk_size=400, chunk_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True ) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 +AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 + chunks = chunker._chunk_text(text) assert len(chunks) == 4 assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" @@ -78,36 +85,111 @@ def test_chunk_text_with_multiple_separators_recursive(): chunks[1] == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" ) # noqa: E501 - assert chunks[2] == "AI technology is widely used throughout industry, government, and science." + assert chunks[2] == "AI technology is widely used throughout industry, government, and science." 
# noqa: E501 assert ( chunks[3] - == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games." + == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." ) # noqa: E501 -def test_chunk_text_using_nltk_sentence(): - """ - This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and - requires a more sophisticated sentence tokenizer like the one provided by NLTK. - """ +def test_recursive_splitter_empty_documents(): + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) + empty_doc = Document(content="") + doc_chunks = chunker.run([empty_doc]) + assert len(doc_chunks) == 0 + +def test_recursive_chunker_with_multiple_separators_recursive(): chunker = RecursiveChunker( - chunk_size=400, chunk_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True + chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True ) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 +AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 - chunks = chunker._chunk_text(text) - assert len(chunks) == 4 - assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" + doc = Document(content=text) + doc_chunks = chunker.run([doc]) + assert len(doc_chunks) == 4 assert ( - chunks[1] + doc_chunks[0].meta["original_id"] + == doc_chunks[1].meta["original_id"] + == doc_chunks[2].meta["original_id"] + == doc_chunks[3].meta["original_id"] + == doc.id + ) + assert doc_chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" + assert ( + doc_chunks[1].content == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" - ) # noqa: E501 - assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501 + ) + assert doc_chunks[2].content == "AI technology is widely used throughout industry, government, and science." 
assert ( - chunks[3] - == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." - ) # noqa: E501 + doc_chunks[3].content + == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games." + ) + + +@pytest.mark.parametrize("chunk_overlap", [0, 9]) +def test_recursive_chunker_split_document_with_overlap(chunk_overlap): + chunker = RecursiveChunker(chunk_size=20, chunk_overlap=chunk_overlap, separators=[".", " "], keep_separator=True) + text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence""" + + doc = Document(content=text) + doc_chunks = chunker.run([doc]) + if chunker.chunk_overlap == 20: + assert len(doc_chunks) == 4 + for i, chunk in enumerate(doc_chunks): + assert chunk.content == "A simple sentence." + assert chunk.meta["original_id"] == doc.id + + if chunker.chunk_overlap == 9: + assert len(doc_chunks) == 4 + for i, chunk in enumerate(doc_chunks): + assert chunk.meta["original_id"] == doc.id + if i == 0: + assert chunk.content == "A simple sentence." + else: + assert chunk.content == "sentence.A simple sentence." + + +def test_to_dict(): + chunker = RecursiveChunker( + chunk_size=20, chunk_overlap=5, separators=["."], keep_separator=True, is_separator_regex=False + ) + data = chunker.to_dict() + + assert data["type"] == "haystack.components.preprocessors.recursive_chunker.RecursiveChunker" + assert data["init_parameters"]["chunk_size"] == 20 + assert data["init_parameters"]["chunk_overlap"] == 5 + assert data["init_parameters"]["separators"] == ["."] + assert data["init_parameters"]["keep_separator"] is True + assert data["init_parameters"]["is_separator_regex"] is False + + +def test_from_dict(): + data = { + "type": "haystack.components.preprocessors.recursive_chunker.RecursiveChunker", + "init_parameters": { + "chunk_size": 20, + "chunk_overlap": 5, + "separators": ["."], + "keep_separator": True, + "is_separator_regex": False, + }, + } + chunker = RecursiveChunker.from_dict(data) + assert chunker.chunk_size == 20 + assert chunker.chunk_overlap == 5 + assert chunker.separators == ["."] + assert chunker.keep_separator is True + assert chunker.is_separator_regex is False + + +@pytest.mark.integration +def test_recursive_splitter_serialization_in_pipeline(): + pipeline = Pipeline() + pipeline.add_component("chunker", RecursiveChunker(chunk_size=20, chunk_overlap=5, separators=["."])) + pipeline_dict = pipeline.dumps() + new_pipeline = Pipeline.loads(pipeline_dict) + assert pipeline_dict == new_pipeline.dumps() From 74de92ce0d66ded41f4e26ded7f0d55841ade2eb Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 4 Dec 2024 16:17:46 +0100 Subject: [PATCH 09/82] adding release notes --- ...g-recursive-splitter-1fa716fdd77d4d8c.yaml | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml diff --git a/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml b/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml new file mode 100644 index 0000000000..ffeefe1890 --- /dev/null +++ b/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml @@ -0,0 +1,32 @@ +--- +highlights: > + Replace this text with content to appear at the top of the section for this + release. The highlights might repeat some details that are also present in other notes + from the same release, that's ok. Not every release note requires highlights, + use this section only to describe major features or notable changes. +upgrade: + - | + List upgrade notes here, or remove this section. + Upgrade notes should be rare: only list known/potential breaking changes, + or major changes that require user action before the upgrade. + Notes here must include steps that users can follow to 1. know if they're + affected and 2. handle the change gracefully on their end. +features: + - | + List new features here, or remove this section. +enhancements: + - | + List new behavior that is too small to be + considered a new feature, or remove this section. +issues: + - | + List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here. +deprecations: + - | + List deprecations notes here, or remove this section. Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release. +security: + - | + Add security notes here, or remove this section. +fixes: + - | + Add normal bug fixes here, or remove this section. From 4054c4733a3465e4c2c5bd3cc2144aaf092088b6 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 4 Dec 2024 16:20:42 +0100 Subject: [PATCH 10/82] adding LICENSE header --- haystack/components/preprocessors/recursive_chunker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index 06f0e881d2..be9d6c6e53 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import re from typing import Any, Dict, List From 6b72a17260d9c21cfe755e99e8391a5183285083 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 4 Dec 2024 16:34:42 +0100 Subject: [PATCH 11/82] removing unused imports --- haystack/components/preprocessors/recursive_chunker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index be9d6c6e53..d9b6454951 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -38,7 +38,7 @@ def _check_params(self): @staticmethod def _get_custom_sentence_tokenizer(): try: - from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports + from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter except (LookupError, ModuleNotFoundError): raise Exception("You need to install NLTK to use this function. You can install it via `pip install nltk`") return SentenceSplitter(language="en") From 4c0afb1c74c535216b899a83deaea93c2b8d78bf Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 4 Dec 2024 17:36:38 +0100 Subject: [PATCH 12/82] fixing example docstring --- .../preprocessors/recursive_chunker.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index d9b6454951..17f171cd16 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -12,6 +12,40 @@ @component class RecursiveChunker: + """ + Recursively chunk text into smaller chunks. + + This component is used to split text into smaller chunks, it does so by recursively applying a list of separators + to the text. + + Each separator is applied to the text, if then checks each of the resulting chunks, it keeps the ones chunks that + are within the chunk_size, for the ones that are larger than the chunk_size, it applies the next separator in the + list to the remaining text. + + This is done until all chunks are smaller than the chunk_size parameter. + + Example: + + ```python + from haystack import Document + from haystack.components.preprocessors.recursive_chunker import RecursiveChunker + + chunker = RecursiveChunker(chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) + text = '''Artificial intelligence (AI) - Introduction + + AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. + AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.''' + + doc = Document(content=text) + doc_chunks = chunker.run([doc]) + >[ + >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), + >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. 
', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), + >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), + >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}) + >] + """ # noqa: E501 + def __init__( # pylint: disable=too-many-positional-arguments self, chunk_size: int, From 8e62968b44d8ebcd3736cf1b727edfd0ae5369f2 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 4 Dec 2024 17:54:10 +0100 Subject: [PATCH 13/82] addding docstrings --- .../components/preprocessors/recursive_chunker.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_chunker.py index 17f171cd16..c1ddb0dc47 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_chunker.py @@ -193,9 +193,15 @@ def _run_one(self, doc: Document) -> List[Document]: new_docs.append(new_doc) return new_docs - def run(self, documents: List[Document]) -> List[Document]: + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document]) -> Dict[str, List[Document]]: """ - Split text of documents into smaller chunks recursively. + Split documents into Documents with smaller chunks of text. + + :param documents: List of Documents to split. + :returns: + A dictionary containing a key "documents" with a List of Documents with smaller chunks of text corresponding + to the input documents. """ new_docs = [] for doc in documents: @@ -203,4 +209,4 @@ def run(self, documents: List[Document]) -> List[Document]: logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue new_docs.extend(self._run_one(doc)) - return new_docs + return {"documents": new_docs} From 12549bd582f56eb6fef557fb356a9a22b8088240 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 4 Dec 2024 18:03:34 +0100 Subject: [PATCH 14/82] fixing tests and returning a dictionary --- test/components/preprocessors/test_recursive_chunker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/components/preprocessors/test_recursive_chunker.py b/test/components/preprocessors/test_recursive_chunker.py index 9ad3b45a58..03971f54e5 100644 --- a/test/components/preprocessors/test_recursive_chunker.py +++ b/test/components/preprocessors/test_recursive_chunker.py @@ -96,6 +96,7 @@ def test_recursive_splitter_empty_documents(): chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) empty_doc = Document(content="") doc_chunks = chunker.run([empty_doc]) + doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 0 @@ -110,6 +111,7 @@ def test_recursive_chunker_with_multiple_separators_recursive(): doc = Document(content=text) doc_chunks = chunker.run([doc]) + doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 4 assert ( doc_chunks[0].meta["original_id"] @@ -137,6 +139,7 @@ def test_recursive_chunker_split_document_with_overlap(chunk_overlap): doc = Document(content=text) doc_chunks = chunker.run([doc]) + doc_chunks = doc_chunks["documents"] if chunker.chunk_overlap == 20: assert len(doc_chunks) == 4 for i, chunk in enumerate(doc_chunks): From 20a7f527cacbda4cda211d03de2e9e722d3a161c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 4 Dec 2024 18:35:27 +0100 Subject: [PATCH 15/82] updating release notes --- ...g-recursive-splitter-1fa716fdd77d4d8c.yaml | 30 +------------------ 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml b/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml index ffeefe1890..aea4cd6d69 100644 --- a/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml +++ b/releasenotes/notes/adding-recursive-splitter-1fa716fdd77d4d8c.yaml @@ -1,32 +1,4 @@ --- -highlights: > - Replace this text with content to appear at the top of the section for this - release. The highlights might repeat some details that are also present in other notes - from the same release, that's ok. Not every release note requires highlights, - use this section only to describe major features or notable changes. -upgrade: - - | - List upgrade notes here, or remove this section. - Upgrade notes should be rare: only list known/potential breaking changes, - or major changes that require user action before the upgrade. - Notes here must include steps that users can follow to 1. know if they're - affected and 2. handle the change gracefully on their end. features: - | - List new features here, or remove this section. -enhancements: - - | - List new behavior that is too small to be - considered a new feature, or remove this section. -issues: - - | - List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here. -deprecations: - - | - List deprecations notes here, or remove this section. Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release. -security: - - | - Add security notes here, or remove this section. -fixes: - - | - Add normal bug fixes here, or remove this section. + Adding a `RecursiveChunker,` which uses a set of separators to split text recursively. 
It attempts to divide the text using the first separator, if the resulting chunks are still larger than the specified size, it moves to the next separator in the list. From 5945e6ddc2bdb4bbb0c17fcef0912d6e277b50ab Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 6 Dec 2024 15:14:29 +0100 Subject: [PATCH 16/82] attending PR comments --- haystack/components/preprocessors/__init__.py | 3 +- ...rsive_chunker.py => recursive_splitter.py} | 82 +++++----- ..._chunker.py => test_recursive_splitter.py} | 146 ++++++++---------- 3 files changed, 103 insertions(+), 128 deletions(-) rename haystack/components/preprocessors/{recursive_chunker.py => recursive_splitter.py} (76%) rename test/components/preprocessors/{test_recursive_chunker.py => test_recursive_splitter.py} (55%) diff --git a/haystack/components/preprocessors/__init__.py b/haystack/components/preprocessors/__init__.py index f7e132077a..33e446e8a6 100644 --- a/haystack/components/preprocessors/__init__.py +++ b/haystack/components/preprocessors/__init__.py @@ -5,6 +5,7 @@ from .document_cleaner import DocumentCleaner from .document_splitter import DocumentSplitter from .nltk_document_splitter import NLTKDocumentSplitter +from .recursive_splitter import RecursiveDocumentSplitter from .text_cleaner import TextCleaner -__all__ = ["DocumentSplitter", "DocumentCleaner", "TextCleaner", "NLTKDocumentSplitter"] +__all__ = ["DocumentSplitter", "DocumentCleaner", "RecursiveDocumentSplitter", "TextCleaner", "NLTKDocumentSplitter"] diff --git a/haystack/components/preprocessors/recursive_chunker.py b/haystack/components/preprocessors/recursive_splitter.py similarity index 76% rename from haystack/components/preprocessors/recursive_chunker.py rename to haystack/components/preprocessors/recursive_splitter.py index c1ddb0dc47..c25e9724d7 100644 --- a/haystack/components/preprocessors/recursive_chunker.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -3,15 +3,15 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Any, Dict, List +from typing import Dict, List, Optional -from haystack import Document, component, default_from_dict, default_to_dict, logging +from haystack import Document, component, logging logger = logging.getLogger(__name__) @component -class RecursiveChunker: +class RecursiveDocumentSplitter: """ Recursively chunk text into smaller chunks. @@ -28,7 +28,7 @@ class RecursiveChunker: ```python from haystack import Document - from haystack.components.preprocessors.recursive_chunker import RecursiveChunker + from haystack.components.preprocessors import RecursiveChunker chunker = RecursiveChunker(chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) text = '''Artificial intelligence (AI) - Introduction @@ -48,26 +48,41 @@ class RecursiveChunker: def __init__( # pylint: disable=too-many-positional-arguments self, - chunk_size: int, - chunk_overlap: int, - separators: List[str], + split_length: int = 200, + split_overlap: int = 0, + separators: Optional[List[str]] = None, keep_separator: bool = True, is_separator_regex: bool = False, ): - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap - self.separators = separators + """ + Initializes a RecursiveDocumentSplitter. + + :param split_length: The maximum length of each chunk. + :param split_overlap: The number of characters to overlap between consecutive chunks. + :param separators: A list of separator characters to use for splitting the text. 
If the separator is "sentence", + the text will be split into sentences using a custom sentence tokenizer based on NLTK. + :param keep_separator: Whether to keep the separator character in the resulting chunks. + :param is_separator_regex: Whether the separator is a regular expression. + + :raises ValueError: If the overlap is greater than or equal to the chunk size or if the overlap is negative, or + if any separator is not a string. + """ + self.split_length = split_length + self.split_overlap = split_overlap + self.separators = separators if separators else ["\n\n", "\n", ".", " "] self.keep_separator = keep_separator self.is_separator_regex = is_separator_regex self._check_params() - if "sentence" in separators: + if separators and "sentence" in separators: self.nltk_tokenizer = self._get_custom_sentence_tokenizer() def _check_params(self): - if self.chunk_overlap < 0: + if self.split_overlap < 0: raise ValueError("Overlap must be greater than zero.") - if self.chunk_overlap >= self.chunk_size: + if self.split_overlap >= self.split_length: raise ValueError("Overlap cannot be greater than or equal to the chunk size.") + if not all(isinstance(separator, str) for separator in self.separators): + raise ValueError("All separators must be strings.") @staticmethod def _get_custom_sentence_tokenizer(): @@ -75,11 +90,11 @@ def _get_custom_sentence_tokenizer(): from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter except (LookupError, ModuleNotFoundError): raise Exception("You need to install NLTK to use this function. You can install it via `pip install nltk`") - return SentenceSplitter(language="en") + return SentenceSplitter() def _apply_overlap(self, chunks: List[str]) -> List[str]: """ - Applies an overlap between consecutive chunks if the chunk_overlap attribute is greater than zero. + Applies an overlap between consecutive chunks if the chunk_overlap attribute is greater than zero. :param chunks: List of text chunks. :returns: @@ -90,7 +105,7 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: if idx == 0: overlapped_chunks.append(chunk) continue - overlap_start = max(0, len(chunks[idx - 1]) - self.chunk_overlap) + overlap_start = max(0, len(chunks[idx - 1]) - self.split_overlap) current_chunk = chunks[idx - 1][overlap_start:] + chunk overlapped_chunks.append(current_chunk) return overlapped_chunks @@ -104,15 +119,16 @@ def _chunk_text(self, text: str) -> List[str]: the next separator in the list. This process continues recursively, progressively applying each specific separator until the chunks meet the desired size criteria. - :param text: + :param text: The text to be split into chunks. :returns: A list of text chunks. 
""" - if len(text) <= self.chunk_size: + if len(text) <= self.split_length: return [text] + # type ignore below because we already checked that separators is not None # try each separator - for separator in self.separators: + for separator in self.separators: # type: ignore if separator in "sentence": # using nltk sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] @@ -137,14 +153,14 @@ def _chunk_text(self, text: str) -> List[str]: split_text = split + separator # if adding this split exceeds chunk_size, process current_chunk - if current_length + len(split_text) > self.chunk_size: + if current_length + len(split_text) > self.split_length: if current_chunk: # keep the good splits chunks.append("".join(current_chunk)) current_chunk = [] current_length = 0 # recursively handle splits that are too large - if len(split_text) > self.chunk_size: + if len(split_text) > self.split_length: chunks.extend(self._chunk_text(split_text)) else: chunks.append(split_text) @@ -155,33 +171,13 @@ def _chunk_text(self, text: str) -> List[str]: if current_chunk: chunks.append("".join(current_chunk)) - if self.chunk_overlap > 0: + if self.split_overlap > 0: chunks = self._apply_overlap(chunks) return chunks # if no separator worked, fall back to character-level chunking - return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)] - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes the RecursiveChunker instance to a dictionary. - """ - return default_to_dict( - self, - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap, - separators=self.separators, - keep_separator=self.keep_separator, - is_separator_regex=self.is_separator_regex, - ) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "RecursiveChunker": - """ - Deserializes a dictionary to a RecursiveChunker instance. 
- """ - return default_from_dict(cls, data) + return [text[i : i + self.split_length] for i in range(0, len(text), self.split_length - self.split_overlap)] def _run_one(self, doc: Document) -> List[Document]: new_docs = [] diff --git a/test/components/preprocessors/test_recursive_chunker.py b/test/components/preprocessors/test_recursive_splitter.py similarity index 55% rename from test/components/preprocessors/test_recursive_chunker.py rename to test/components/preprocessors/test_recursive_splitter.py index 03971f54e5..b521b4ec21 100644 --- a/test/components/preprocessors/test_recursive_chunker.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -1,67 +1,74 @@ import pytest from haystack import Document, Pipeline -from haystack.components.preprocessors.recursive_chunker import RecursiveChunker +from haystack.components.preprocessors.recursive_splitter import RecursiveDocumentSplitter def test_init_with_negative_overlap(): with pytest.raises(ValueError): - _ = RecursiveChunker(chunk_size=20, chunk_overlap=-1, separators=["."]) + _ = RecursiveDocumentSplitter(split_length=20, split_overlap=-1, separators=["."]) def test_init_with_overlap_greater_than_chunk_size(): with pytest.raises(ValueError): - _ = RecursiveChunker(chunk_size=10, chunk_overlap=15, separators=["."]) + _ = RecursiveDocumentSplitter(split_length=10, split_overlap=15, separators=["."]) + + +def test_init_with_invalid_separators(): + with pytest.raises(ValueError): + _ = RecursiveDocumentSplitter(separators=[".", 2]) def test_apply_overlap_no_overlap(): # Test the case where there is no overlap between chunks - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) chunks = ["chunk1", "chunk2", "chunk3"] - result = chunker._apply_overlap(chunks) + result = splitter._apply_overlap(chunks) assert result == ["chunk1", "chunk2", "chunk3"] def test_apply_overlap_with_overlap(): # Test the case where there is overlap between chunks - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=4, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."]) chunks = ["chunk1", "chunk2", "chunk3"] - result = chunker._apply_overlap(chunks) + result = splitter._apply_overlap(chunks) assert result == ["chunk1", "unk1chunk2", "unk2chunk3"] def test_apply_overlap_single_chunk(): # Test the case where there is only one chunk - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=3, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."]) chunks = ["chunk1"] - result = chunker._apply_overlap(chunks) + result = splitter._apply_overlap(chunks) assert result == ["chunk1"] def test_chunk_text_smaller_than_chunk_size(): - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) text = "small text" - chunks = chunker._chunk_text(text) + chunks = splitter._chunk_text(text) assert len(chunks) == 1 assert chunks[0] == text -@pytest.mark.parametrize("keep_separator", [True, False]) -def test_chunk_text_with_simple_separator(keep_separator): - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."], keep_separator=keep_separator) +def test_chunk_text_keep_seperator(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=True) text = "This is a 
test. Another sentence. And one more." - chunks = chunker._chunk_text(text) + chunks = splitter._chunk_text(text) + assert len(chunks) == 3 + assert chunks[0] == "This is a test." + assert chunks[1] == " Another sentence." + assert chunks[2] == " And one more." - if keep_separator: - assert len(chunks) == 3 - assert chunks[0] == "This is a test." - assert chunks[1] == " Another sentence." - assert chunks[2] == " And one more." - else: - assert len(chunks) == 3 - assert chunks[0] == "This is a test" - assert chunks[1] == " Another sentence" - assert chunks[2] == " And one more" + +def test_chunk_text_do_not_keep_seperator(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=False) + text = "This is a test. Another sentence. And one more." + chunks = splitter._chunk_text(text) + assert len(chunks) == 3 + assert chunks[0] == "This is a test" + assert chunks[1] == " Another sentence" + assert chunks[2] == " And one more" def test_chunk_text_using_nltk_sentence(): @@ -70,15 +77,15 @@ def test_chunk_text_using_nltk_sentence(): requires a more sophisticated sentence tokenizer like the one provided by NLTK. """ - chunker = RecursiveChunker( - chunk_size=400, chunk_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True + splitter = RecursiveDocumentSplitter( + split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True ) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 - chunks = chunker._chunk_text(text) + chunks = splitter._chunk_text(text) assert len(chunks) == 4 assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" assert ( @@ -93,16 +100,16 @@ def test_chunk_text_using_nltk_sentence(): def test_recursive_splitter_empty_documents(): - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=0, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) empty_doc = Document(content="") - doc_chunks = chunker.run([empty_doc]) + doc_chunks = splitter.run([empty_doc]) doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 0 def test_recursive_chunker_with_multiple_separators_recursive(): - chunker = RecursiveChunker( - chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True + splitter = RecursiveDocumentSplitter( + split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True ) text = """Artificial intelligence (AI) - Introduction @@ -110,7 +117,7 @@ def test_recursive_chunker_with_multiple_separators_recursive(): AI technology is widely used throughout industry, government, and science. 
Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 doc = Document(content=text) - doc_chunks = chunker.run([doc]) + doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 4 assert ( @@ -132,67 +139,38 @@ def test_recursive_chunker_with_multiple_separators_recursive(): ) -@pytest.mark.parametrize("chunk_overlap", [0, 9]) -def test_recursive_chunker_split_document_with_overlap(chunk_overlap): - chunker = RecursiveChunker(chunk_size=20, chunk_overlap=chunk_overlap, separators=[".", " "], keep_separator=True) +def test_recursive_chunker_split_document_with_overlap(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=9, separators=[".", " "], keep_separator=True) text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence""" doc = Document(content=text) - doc_chunks = chunker.run([doc]) + doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - if chunker.chunk_overlap == 20: - assert len(doc_chunks) == 4 - for i, chunk in enumerate(doc_chunks): + + assert len(doc_chunks) == 4 + for i, chunk in enumerate(doc_chunks): + assert chunk.meta["original_id"] == doc.id + if i == 0: assert chunk.content == "A simple sentence." - assert chunk.meta["original_id"] == doc.id + else: + assert chunk.content == "sentence.A simple sentence." + - if chunker.chunk_overlap == 9: - assert len(doc_chunks) == 4 - for i, chunk in enumerate(doc_chunks): - assert chunk.meta["original_id"] == doc.id - if i == 0: - assert chunk.content == "A simple sentence." - else: - assert chunk.content == "sentence.A simple sentence." +def test_recursive_splitter_no_seperator_used_and_no_overlap(): + splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["!", "-"], keep_separator=True) + text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence.""" + doc = Document(content=text) + doc_chunks = splitter.run([doc]) + doc_chunks = doc_chunks["documents"] + assert len(doc_chunks) == 4 + for i, chunk in enumerate(doc_chunks): + assert chunk.meta["original_id"] == doc.id + assert chunk.content == "A simple sentence." 
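For illustration, the fallback behaviour exercised by the test above can be reproduced with a short standalone sketch that mirrors the character-level chunking expression the component falls back to when none of the configured separators occur in the text (the concrete values are taken from the test, not from the component itself):

    # No separator ("!" or "-") matches, so the text is cut into fixed-size windows of split_length characters.
    text = "A simple sentence." * 4          # 72 characters
    split_length, split_overlap = 18, 0
    step = split_length - split_overlap
    chunks = [text[i : i + split_length] for i in range(0, len(text), step)]
    assert chunks == ["A simple sentence."] * 4
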
-def test_to_dict(): - chunker = RecursiveChunker( - chunk_size=20, chunk_overlap=5, separators=["."], keep_separator=True, is_separator_regex=False - ) - data = chunker.to_dict() - - assert data["type"] == "haystack.components.preprocessors.recursive_chunker.RecursiveChunker" - assert data["init_parameters"]["chunk_size"] == 20 - assert data["init_parameters"]["chunk_overlap"] == 5 - assert data["init_parameters"]["separators"] == ["."] - assert data["init_parameters"]["keep_separator"] is True - assert data["init_parameters"]["is_separator_regex"] is False - - -def test_from_dict(): - data = { - "type": "haystack.components.preprocessors.recursive_chunker.RecursiveChunker", - "init_parameters": { - "chunk_size": 20, - "chunk_overlap": 5, - "separators": ["."], - "keep_separator": True, - "is_separator_regex": False, - }, - } - chunker = RecursiveChunker.from_dict(data) - assert chunker.chunk_size == 20 - assert chunker.chunk_overlap == 5 - assert chunker.separators == ["."] - assert chunker.keep_separator is True - assert chunker.is_separator_regex is False - - -@pytest.mark.integration def test_recursive_splitter_serialization_in_pipeline(): pipeline = Pipeline() - pipeline.add_component("chunker", RecursiveChunker(chunk_size=20, chunk_overlap=5, separators=["."])) + pipeline.add_component("chunker", RecursiveDocumentSplitter(split_length=20, split_overlap=5, separators=["."])) pipeline_dict = pipeline.dumps() new_pipeline = Pipeline.loads(pipeline_dict) assert pipeline_dict == new_pipeline.dumps() From b5391f6864cc07e29c51e325f1f82471f012c9f1 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 10 Dec 2024 11:07:32 +0100 Subject: [PATCH 17/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index c25e9724d7..43ced1c5e1 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -18,7 +18,7 @@ class RecursiveDocumentSplitter: This component is used to split text into smaller chunks, it does so by recursively applying a list of separators to the text. - Each separator is applied to the text, if then checks each of the resulting chunks, it keeps the ones chunks that + Each separator is applied to the text, it then checks each of the resulting chunks, it keeps the chunks that are within the chunk_size, for the ones that are larger than the chunk_size, it applies the next separator in the list to the remaining text. From adf1b1a921a041782b0e2abad7accd13ea7f5f4b Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 10 Dec 2024 17:03:08 +0100 Subject: [PATCH 18/82] wip: updating tests for split_idx_start and _split_overlap --- .../preprocessors/recursive_splitter.py | 61 +++++++++++++------ .../preprocessors/test_recursive_splitter.py | 51 +++++++++++++--- 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 43ced1c5e1..105a56e496 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -1,8 +1,8 @@ # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 - import re +from copy import deepcopy from typing import Dict, List, Optional from haystack import Document, component, logging @@ -52,17 +52,17 @@ def __init__( # pylint: disable=too-many-positional-arguments split_overlap: int = 0, separators: Optional[List[str]] = None, keep_separator: bool = True, - is_separator_regex: bool = False, ): """ Initializes a RecursiveDocumentSplitter. :param split_length: The maximum length of each chunk. :param split_overlap: The number of characters to overlap between consecutive chunks. - :param separators: A list of separator characters to use for splitting the text. If the separator is "sentence", - the text will be split into sentences using a custom sentence tokenizer based on NLTK. + :param separators: An optional list of separator strings to use for splitting the text. The string + separators will be treated as regular expressions un less if the separator is "sentence", in that case the + text will be split into sentences using a custom sentence tokenizer based on NLTK. + If no separators are provided, the default separators ["\n\n", "\n", ".", " "] are used. :param keep_separator: Whether to keep the separator character in the resulting chunks. - :param is_separator_regex: Whether the separator is a regular expression. :raises ValueError: If the overlap is greater than or equal to the chunk size or if the overlap is negative, or if any separator is not a string. 
@@ -71,9 +71,8 @@ def __init__( # pylint: disable=too-many-positional-arguments self.split_overlap = split_overlap self.separators = separators if separators else ["\n\n", "\n", ".", " "] self.keep_separator = keep_separator - self.is_separator_regex = is_separator_regex self._check_params() - if separators and "sentence" in separators: + if "sentence" in self.separators: self.nltk_tokenizer = self._get_custom_sentence_tokenizer() def _check_params(self): @@ -129,15 +128,13 @@ def _chunk_text(self, text: str) -> List[str]: # type ignore below because we already checked that separators is not None # try each separator for separator in self.separators: # type: ignore - if separator in "sentence": # using nltk sentence tokenizer + if separator == "sentence": + # using the custom NLTK-based sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] else: - # split using the current separator - splits = text.split(separator) if not self.is_separator_regex else re.split(separator, text) - - # filter out empty splits - splits = [s for s in splits if s.strip()] + # apply current separator regex to split text + splits = re.split(re.escape(separator), text) if len(splits) == 1: # go to next separator, if current separator not found continue @@ -147,9 +144,11 @@ def _chunk_text(self, text: str) -> List[str]: current_length = 0 # check splits, if any is too long, recursively chunk it, otherwise add to current chunk - for split in splits: + for idx, split in enumerate(splits): split_text = split - if self.keep_separator and separator != "sentence": + + # add separator to the split, if it's not the last one + if self.keep_separator and separator != "sentence" and idx < len(splits) - 1: split_text = split + separator # if adding this split exceeds chunk_size, process current_chunk @@ -183,10 +182,31 @@ def _run_one(self, doc: Document) -> List[Document]: new_docs = [] # NOTE: the check for a non-empty content is already done in the run method, hence the type ignore below chunks = self._chunk_text(doc.content) # type: ignore - for chunk in chunks: - new_doc = Document(content=chunk, meta=doc.meta) + current_position = 0 + for split_nr, chunk in enumerate(chunks): + new_doc = Document(content=chunk, meta=deepcopy(doc.meta)) new_doc.meta["original_id"] = doc.id + new_doc.meta["split_id"] = split_nr + new_doc.meta["split_idx_start"] = current_position + new_doc.meta["_split_overlap"] = [] + + if split_nr > 0 and self.split_overlap > 0: + previous_doc = new_docs[-1] + overlap_length = len(previous_doc.content) - (current_position - previous_doc.meta["split_idx_start"]) + if overlap_length > 0: + # overlap info to previous document + previous_doc.meta["_split_overlap"].append( + { + "doc_id": new_doc.id, + "range": (len(previous_doc.content) - overlap_length, len(previous_doc.content)), + } + ) + # overlap info to current document + new_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": (0, overlap_length)}) + new_docs.append(new_doc) + current_position += len(chunk) - (self.split_overlap if split_nr < len(chunks) - 1 else 0) + return new_docs @component.output_types(documents=List[Document]) @@ -199,10 +219,11 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]: A dictionary containing a key "documents" with a List of Documents with smaller chunks of text corresponding to the input documents. 
""" - new_docs = [] + docs = [] for doc in documents: if not doc.content or doc.content == "": logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue - new_docs.extend(self._run_one(doc)) - return {"documents": new_docs} + docs.extend(self._run_one(doc)) + + return {"documents": docs} diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index b521b4ec21..b513d3af4c 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -51,7 +51,7 @@ def test_chunk_text_smaller_than_chunk_size(): assert chunks[0] == text -def test_chunk_text_keep_seperator(): +def test_chunk_text_keep_separator(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=True) text = "This is a test. Another sentence. And one more." chunks = splitter._chunk_text(text) @@ -71,6 +71,15 @@ def test_chunk_text_do_not_keep_seperator(): assert chunks[2] == " And one more" +def test_keep_separator_chunks_are_equal(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=True) + text = "This is a test.This is a test" + chunks = splitter._chunk_text(text) + assert len(chunks) == 2 + assert chunks[0] == "This is a test." + assert chunks[1] == "This is a test" + + def test_chunk_text_using_nltk_sentence(): """ This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and @@ -141,22 +150,46 @@ def test_recursive_chunker_with_multiple_separators_recursive(): def test_recursive_chunker_split_document_with_overlap(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=9, separators=[".", " "], keep_separator=True) - text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence""" + text = """A simple sentence.A bright sentence.A clever sentence.A joyful sentence""" doc = Document(content=text) doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] + print("\n") + for doc in doc_chunks: + print(doc.id) + print(doc.content) + print(doc.meta) + print("-------") + assert len(doc_chunks) == 4 - for i, chunk in enumerate(doc_chunks): - assert chunk.meta["original_id"] == doc.id - if i == 0: - assert chunk.content == "A simple sentence." - else: - assert chunk.content == "sentence.A simple sentence." + + assert doc_chunks[0].content == "A simple sentence." + assert doc_chunks[0].meta["split_id"] == 0 + assert doc_chunks[0].meta["split_idx_start"] == 0 + assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (9, 18)}] + + assert doc_chunks[1].content == "sentence.A bright sentence." + assert doc_chunks[1].meta["split_id"] == 1 + assert doc_chunks[1].meta["split_idx_start"] == 9 + assert doc_chunks[1].meta["_split_overlap"] == [ + {"doc_id": doc_chunks[0].id, "range": (0, 9)}, + {"doc_id": doc_chunks[2].id, "range": (18, 27)}, + ] + + assert doc_chunks[2].content == "sentence.A clever sentence." 
+ assert doc_chunks[2].meta["split_id"] == 2 + assert doc_chunks[2].meta["split_idx_start"] == 18 + assert doc_chunks[2].meta["_split_overlap"] == [ + {"doc_id": doc_chunks[1].id, "range": (0, 9)}, + {"doc_id": doc_chunks[3].id, "range": (27, 36)}, + ] + + # assert doc_chunks[3].content == "sentence.A joyful sentence" -def test_recursive_splitter_no_seperator_used_and_no_overlap(): +def test_recursive_splitter_no_separator_used_and_no_overlap(): splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["!", "-"], keep_separator=True) text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence.""" doc = Document(content=text) From d4a2a0b40b952ae6d4a9e810dccf355d6c735ddc Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 11 Dec 2024 11:23:42 +0100 Subject: [PATCH 19/82] adding tests for split_idx and split_start and overlaps --- .../preprocessors/recursive_splitter.py | 13 +++--- .../preprocessors/test_recursive_splitter.py | 44 ++++++++++--------- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 105a56e496..b1cdedcc09 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -179,9 +179,10 @@ def _chunk_text(self, text: str) -> List[str]: return [text[i : i + self.split_length] for i in range(0, len(text), self.split_length - self.split_overlap)] def _run_one(self, doc: Document) -> List[Document]: - new_docs = [] + new_docs: List[Document] = [] # NOTE: the check for a non-empty content is already done in the run method, hence the type ignore below chunks = self._chunk_text(doc.content) # type: ignore + chunks = chunks[:-1] if len(chunks[-1]) == 0 else chunks # remove last empty chunk current_position = 0 for split_nr, chunk in enumerate(chunks): new_doc = Document(content=chunk, meta=deepcopy(doc.meta)) @@ -194,15 +195,15 @@ def _run_one(self, doc: Document) -> List[Document]: previous_doc = new_docs[-1] overlap_length = len(previous_doc.content) - (current_position - previous_doc.meta["split_idx_start"]) if overlap_length > 0: - # overlap info to previous document - previous_doc.meta["_split_overlap"].append( + # previous document + previous_doc.meta["_split_overlap"].append({"doc_id": new_doc.id, "range": (0, overlap_length)}) + # current document + new_doc.meta["_split_overlap"].append( { - "doc_id": new_doc.id, + "doc_id": previous_doc.id, "range": (len(previous_doc.content) - overlap_length, len(previous_doc.content)), } ) - # overlap info to current document - new_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": (0, overlap_length)}) new_docs.append(new_doc) current_position += len(chunk) - (self.split_overlap if split_nr < len(chunks) - 1 else 0) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index b513d3af4c..fd225a0b8b 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -128,6 +128,7 @@ def test_recursive_chunker_with_multiple_separators_recursive(): doc = Document(content=text) doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] + assert len(doc_chunks) == 4 assert ( doc_chunks[0].meta["original_id"] @@ -149,44 +150,47 @@ def test_recursive_chunker_with_multiple_separators_recursive(): def 
test_recursive_chunker_split_document_with_overlap(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=9, separators=[".", " "], keep_separator=True) - text = """A simple sentence.A bright sentence.A clever sentence.A joyful sentence""" + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "], keep_separator=True) + text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" doc = Document(content=text) doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - print("\n") - for doc in doc_chunks: - print(doc.id) - print(doc.content) - print(doc.meta) - print("-------") - assert len(doc_chunks) == 4 + assert ( + doc_chunks[0].meta["original_id"] + == doc_chunks[1].meta["original_id"] + == doc_chunks[2].meta["original_id"] + == doc_chunks[3].meta["original_id"] + == doc.id + ) - assert doc_chunks[0].content == "A simple sentence." + assert doc_chunks[0].content == "A simple sentence1." assert doc_chunks[0].meta["split_id"] == 0 assert doc_chunks[0].meta["split_idx_start"] == 0 - assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (9, 18)}] + assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 11)}] - assert doc_chunks[1].content == "sentence.A bright sentence." + assert doc_chunks[1].content == " sentence1. A bright sentence2." assert doc_chunks[1].meta["split_id"] == 1 - assert doc_chunks[1].meta["split_idx_start"] == 9 + assert doc_chunks[1].meta["split_idx_start"] == 8 assert doc_chunks[1].meta["_split_overlap"] == [ - {"doc_id": doc_chunks[0].id, "range": (0, 9)}, - {"doc_id": doc_chunks[2].id, "range": (18, 27)}, + {"doc_id": doc_chunks[0].id, "range": (8, 19)}, + {"doc_id": doc_chunks[2].id, "range": (0, 11)}, ] - assert doc_chunks[2].content == "sentence.A clever sentence." + assert doc_chunks[2].content == " sentence2. A clever sentence3." assert doc_chunks[2].meta["split_id"] == 2 - assert doc_chunks[2].meta["split_idx_start"] == 18 + assert doc_chunks[2].meta["split_idx_start"] == 28 assert doc_chunks[2].meta["_split_overlap"] == [ - {"doc_id": doc_chunks[1].id, "range": (0, 9)}, - {"doc_id": doc_chunks[3].id, "range": (27, 36)}, + {"doc_id": doc_chunks[1].id, "range": (20, 31)}, + {"doc_id": doc_chunks[3].id, "range": (0, 11)}, ] - # assert doc_chunks[3].content == "sentence.A joyful sentence" + assert doc_chunks[3].content == " sentence3. A joyful sentence4" + assert doc_chunks[3].meta["split_id"] == 3 + assert doc_chunks[3].meta["split_idx_start"] == 48 + assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (20, 31)}] def test_recursive_splitter_no_separator_used_and_no_overlap(): From aed28c55e441bf672412fa8b348ca6a478ec202d Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 11 Dec 2024 11:25:25 +0100 Subject: [PATCH 20/82] adjusting file for LICENSE checking --- haystack/components/preprocessors/recursive_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index b1cdedcc09..28f5bb39b3 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 + import re from copy import deepcopy from typing import Dict, List, Optional From 824142ff1065ae318b650d43e6724603b27adea7 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 11 Dec 2024 11:55:03 +0100 Subject: [PATCH 21/82] adding more tests --- haystack/components/preprocessors/recursive_splitter.py | 2 -- test/components/preprocessors/test_recursive_splitter.py | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 28f5bb39b3..9f503f2670 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -196,9 +196,7 @@ def _run_one(self, doc: Document) -> List[Document]: previous_doc = new_docs[-1] overlap_length = len(previous_doc.content) - (current_position - previous_doc.meta["split_idx_start"]) if overlap_length > 0: - # previous document previous_doc.meta["_split_overlap"].append({"doc_id": new_doc.id, "range": (0, overlap_length)}) - # current document new_doc.meta["_split_overlap"].append( { "doc_id": previous_doc.id, diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index fd225a0b8b..e1a906e90b 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -2,6 +2,12 @@ from haystack import Document, Pipeline from haystack.components.preprocessors.recursive_splitter import RecursiveDocumentSplitter +from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter + + +def test_get_custom_sentence_tokenizer_success(): + tokenizer = RecursiveDocumentSplitter._get_custom_sentence_tokenizer() + assert isinstance(tokenizer, SentenceSplitter) def test_init_with_negative_overlap(): From e4815d8430542fd76f16698d596b6f2483e1ae6e Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 11 Dec 2024 14:15:04 +0100 Subject: [PATCH 22/82] adding tests for page numbering --- .../preprocessors/recursive_splitter.py | 24 +++++++----- .../preprocessors/test_recursive_splitter.py | 37 +++++++++++++++++++ 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 9f503f2670..5fab439e5a 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -126,9 +126,7 @@ def _chunk_text(self, text: str) -> List[str]: if len(text) <= self.split_length: return [text] - # type ignore below because we already checked that separators is not None - # try each separator - for separator in self.separators: # type: ignore + for separator in self.separators: # type: ignore # the caller already checked that separators is not None if separator == "sentence": # using the custom NLTK-based sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) @@ -137,6 +135,8 @@ def _chunk_text(self, text: str) -> List[str]: # apply current separator regex to split text splits = re.split(re.escape(separator), text) + print("DEBUG", splits) + if len(splits) == 1: # go to next separator, if current separator not found continue @@ -148,10 +148,13 @@ def _chunk_text(self, text: str) -> List[str]: for idx, split in enumerate(splits): split_text = split - # add separator to the split, if it's not the last one + # add separator to the split, if it's not the last split if self.keep_separator and separator != "sentence" and idx < len(splits) - 1: split_text = split + separator + print("DEBUG", split_text) + print(current_length, len(split_text), self.split_length) + # if adding this split exceeds chunk_size, process current_chunk if current_length + len(split_text) > self.split_length: if current_chunk: # keep the good splits @@ -181,30 +184,33 @@ def _chunk_text(self, text: str) -> List[str]: def _run_one(self, doc: Document) -> List[Document]: new_docs: List[Document] = [] - # NOTE: the check for a non-empty content is already done in the run method, hence the type ignore below - chunks = self._chunk_text(doc.content) # type: ignore + chunks = self._chunk_text(doc.content) # type: ignore # the caller already check for a non-empty doc.content chunks = chunks[:-1] if len(chunks[-1]) == 0 else chunks # remove last empty chunk current_position = 0 + current_page = 1 + for split_nr, chunk in enumerate(chunks): new_doc = Document(content=chunk, meta=deepcopy(doc.meta)) new_doc.meta["original_id"] = doc.id new_doc.meta["split_id"] = split_nr new_doc.meta["split_idx_start"] = current_position - new_doc.meta["_split_overlap"] = [] + new_doc.meta["_split_overlap"] = [] if self.split_overlap > 0 else None + new_doc.meta["page_number"] = current_page if split_nr > 0 and self.split_overlap > 0: previous_doc = new_docs[-1] - overlap_length = len(previous_doc.content) - (current_position - previous_doc.meta["split_idx_start"]) + overlap_length = len(previous_doc.content) - (current_position - previous_doc.meta["split_idx_start"]) # type: ignore if overlap_length > 0: previous_doc.meta["_split_overlap"].append({"doc_id": new_doc.id, "range": (0, overlap_length)}) new_doc.meta["_split_overlap"].append( { "doc_id": previous_doc.id, - "range": (len(previous_doc.content) - overlap_length, len(previous_doc.content)), + "range": (len(previous_doc.content) - overlap_length, len(previous_doc.content)), # 
type: ignore } ) new_docs.append(new_doc) + current_page += chunk.count("\f") # update the page number based on the number of page breaks current_position += len(chunk) - (self.split_overlap if split_nr < len(chunks) - 1 else 0) return new_docs diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index e1a906e90b..78c28e1f2e 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -199,6 +199,43 @@ def test_recursive_chunker_split_document_with_overlap(): assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (20, 31)}] +def test_recursive_splitter_generate_pages(): + splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=[" "], keep_separator=True) + doc = Document(content="This is some text. \f This text is on another page. \f This is the last page.") + doc_chunks = splitter.run([doc]) + doc_chunks = doc_chunks["documents"] + assert len(doc_chunks) == 7 + for doc in doc_chunks: + if doc.meta["split_id"] in [0, 1, 2]: + assert doc.meta["page_number"] == 1 + if doc.meta["split_id"] in [3, 4]: + assert doc.meta["page_number"] == 2 + if doc.meta["split_id"] in [5, 6]: + assert doc.meta["page_number"] == 3 + + +def test_recursive_splitter_split_length_too_small(): + # ToDo: the splitter should raise an error if the split_length is too small, i.e.: cannot be split into chunks of + # the desired length + pass + # ToDo: Add test for the case where the splitter generates pages + """ + splitter = DocumentSplitter(split_by="word", split_length=2) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 2, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + """ + + +def test_recursive_splitter_generate_empty_chunks(): + # ToDo: Add test for the case where the splitter generates empty chunks + pass + + def test_recursive_splitter_no_separator_used_and_no_overlap(): splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["!", "-"], keep_separator=True) text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence.""" From 8f1ae366efae130c615d34d7901295543dc7cf73 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 11 Dec 2024 17:26:52 +0100 Subject: [PATCH 23/82] adding tests for min split lenghts and falling back to character-level chunking based on size --- .../preprocessors/recursive_splitter.py | 30 ++++++---- .../preprocessors/test_recursive_splitter.py | 58 ++++++++++--------- 2 files changed, 48 insertions(+), 40 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 5fab439e5a..5d4d73afac 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -19,6 +19,9 @@ class RecursiveDocumentSplitter: This component is used to split text into smaller chunks, it does so by recursively applying a list of separators to the text. + The separators are applied in the order they are provided, typically this is a list of separators that are + applied in a specific order, being the last separator the most specific one. 
+ Each separator is applied to the text, it then checks each of the resulting chunks, it keeps the chunks that are within the chunk_size, for the ones that are larger than the chunk_size, it applies the next separator in the list to the remaining text. @@ -77,6 +80,8 @@ def __init__( # pylint: disable=too-many-positional-arguments self.nltk_tokenizer = self._get_custom_sentence_tokenizer() def _check_params(self): + if self.split_length < 1: + raise ValueError("Split length must be at least 1 character.") if self.split_overlap < 0: raise ValueError("Overlap must be greater than zero.") if self.split_overlap >= self.split_length: @@ -126,16 +131,15 @@ def _chunk_text(self, text: str) -> List[str]: if len(text) <= self.split_length: return [text] - for separator in self.separators: # type: ignore # the caller already checked that separators is not None - if separator == "sentence": + for curr_separator in self.separators: # type: ignore # the caller already checked that separators is not None + if curr_separator == "sentence": # using the custom NLTK-based sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] else: # apply current separator regex to split text - splits = re.split(re.escape(separator), text) - - print("DEBUG", splits) + escaped_separator = re.escape(curr_separator) + splits = re.split(escaped_separator, text) if len(splits) == 1: # go to next separator, if current separator not found continue @@ -149,11 +153,8 @@ def _chunk_text(self, text: str) -> List[str]: split_text = split # add separator to the split, if it's not the last split - if self.keep_separator and separator != "sentence" and idx < len(splits) - 1: - split_text = split + separator - - print("DEBUG", split_text) - print(current_length, len(split_text), self.split_length) + if self.keep_separator and curr_separator != "sentence" and idx < len(splits) - 1: + split_text = split + curr_separator # if adding this split exceeds chunk_size, process current_chunk if current_length + len(split_text) > self.split_length: @@ -164,7 +165,11 @@ def _chunk_text(self, text: str) -> List[str]: # recursively handle splits that are too large if len(split_text) > self.split_length: - chunks.extend(self._chunk_text(split_text)) + if curr_separator == self.separators[-1]: + # tried the last separator, can't split further, fall back to character-level chunking + break + else: + chunks.extend(self._chunk_text(split_text)) else: chunks.append(split_text) else: @@ -177,7 +182,8 @@ def _chunk_text(self, text: str) -> List[str]: if self.split_overlap > 0: chunks = self._apply_overlap(chunks) - return chunks + if chunks: + return chunks # if no separator worked, fall back to character-level chunking return [text[i : i + self.split_length] for i in range(0, len(text), self.split_length - self.split_overlap)] diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 78c28e1f2e..3d797c0134 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -25,6 +25,11 @@ def test_init_with_invalid_separators(): _ = RecursiveDocumentSplitter(separators=[".", 2]) +def test_init_with_negative_split_length(): + with pytest.raises(ValueError): + _ = RecursiveDocumentSplitter(split_length=-1, separators=["."]) + + def test_apply_overlap_no_overlap(): # Test the case where there is no overlap between chunks 
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) @@ -214,38 +219,35 @@ def test_recursive_splitter_generate_pages(): assert doc.meta["page_number"] == 3 -def test_recursive_splitter_split_length_too_small(): - # ToDo: the splitter should raise an error if the split_length is too small, i.e.: cannot be split into chunks of - # the desired length - pass - # ToDo: Add test for the case where the splitter generates pages - """ - splitter = DocumentSplitter(split_by="word", split_length=2) - doc1 = Document(content="This is some text.\f This text is on another page.") - doc2 = Document(content="This content has two.\f\f page brakes.") - result = splitter.run(documents=[doc1, doc2]) - - expected_pages = [1, 1, 2, 2, 2, 1, 1, 3] - for doc, p in zip(result["documents"], expected_pages): - assert doc.meta["page_number"] == p - """ +def test_recursive_splitter_separator_exists_but_split_length_too_small_fall_back_to_character_chunking(): + splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2) + doc = Document(content="This is some text. This is some more text.") + result = splitter.run(documents=[doc]) + assert len(result["documents"]) == 21 + for doc in result["documents"]: + assert len(doc.content) == 2 def test_recursive_splitter_generate_empty_chunks(): - # ToDo: Add test for the case where the splitter generates empty chunks - pass - - -def test_recursive_splitter_no_separator_used_and_no_overlap(): - splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["!", "-"], keep_separator=True) - text = """A simple sentence.A simple sentence.A simple sentence.A simple sentence.""" + splitter = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"], keep_separator=False) + text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) - doc_chunks = splitter.run([doc]) - doc_chunks = doc_chunks["documents"] - assert len(doc_chunks) == 4 - for i, chunk in enumerate(doc_chunks): - assert chunk.meta["original_id"] == doc.id - assert chunk.content == "A simple sentence." + chunks = splitter.run([doc])["documents"] + + assert chunks[0].content == "This is a test." + assert chunks[1].content == "\nAnother test." + assert chunks[2].content == "Final test." + + +# def test_recursive_splitter_fallback_to_character_chunking(): +# text = "abczdefzghizjkl" +# separators = ["\n\n", "\n", "z"] +# splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, keep_separator=False) +# +# doc = Document(content=text) +# chunks = splitter.run([doc])["documents"] +# for chunk in chunks: +# print(chunk.content) def test_recursive_splitter_serialization_in_pipeline(): From 5a49eab1c970486e511b56ab7084765f7894bd88 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 11 Dec 2024 17:40:45 +0100 Subject: [PATCH 24/82] fixing linting issue --- haystack/components/preprocessors/recursive_splitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 5d4d73afac..8f0889d11a 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -166,10 +166,10 @@ def _chunk_text(self, text: str) -> List[str]: # recursively handle splits that are too large if len(split_text) > self.split_length: if curr_separator == self.separators[-1]: - # tried the last separator, can't split further, fall back to character-level chunking + # tried the last separator, can't split further, break the loop and fall back to + # character-level chunking break - else: - chunks.extend(self._chunk_text(split_text)) + chunks.extend(self._chunk_text(split_text)) else: chunks.append(split_text) else: From a5c1f2c5965eb3aaf2edbfb989f2a81581944adb Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 14:05:11 +0100 Subject: [PATCH 25/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 8f0889d11a..2bfb41f8e8 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -32,7 +32,7 @@ class RecursiveDocumentSplitter: ```python from haystack import Document - from haystack.components.preprocessors import RecursiveChunker + from haystack.components.preprocessors import RecursiveDocumentSplitter chunker = RecursiveChunker(chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) text = '''Artificial intelligence (AI) - Introduction From 2248135b1f02f390d971039b32e10f6d28f90e5f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 14:05:20 +0100 Subject: [PATCH 26/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 2bfb41f8e8..687d10b09f 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -34,7 +34,7 @@ class RecursiveDocumentSplitter: from haystack import Document from haystack.components.preprocessors import RecursiveDocumentSplitter - chunker = RecursiveChunker(chunk_size=260, chunk_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) + chunker = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) text = '''Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. From 4263352d7b3a973d104cf3d78cccc4a4d97776c2 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 14:05:27 +0100 Subject: [PATCH 27/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 687d10b09f..4b5234218e 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -42,6 +42,7 @@ class RecursiveDocumentSplitter: doc = Document(content=text) doc_chunks = chunker.run([doc]) + print(doc_chunks["documents"]) >[ >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. ', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), From 6ee55513eef776da130d6d2ed4d5fb3058203687 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 14:05:38 +0100 Subject: [PATCH 28/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 4b5234218e..a1030a3dce 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -44,7 +44,7 @@ class RecursiveDocumentSplitter: doc_chunks = chunker.run([doc]) print(doc_chunks["documents"]) >[ - >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), + >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}) >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. ', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}) From 0325a8b4be48881e5cd94d5740860f1ad2c032c5 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 14:05:47 +0100 Subject: [PATCH 29/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index a1030a3dce..bc92047024 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -45,7 +45,7 @@ class RecursiveDocumentSplitter: print(doc_chunks["documents"]) >[ >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}) - >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. ', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), + >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 1, 'split_idx_start': 45, '_split_overlap': []}) >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}) >] From b2b94b53e81b989296d083781c1d0833bc236dbb Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 14:05:54 +0100 Subject: [PATCH 30/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index bc92047024..19b420c059 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -46,7 +46,7 @@ class RecursiveDocumentSplitter: >[ >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}) >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 1, 'split_idx_start': 45, '_split_overlap': []}) - >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}), + >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 2, 'split_idx_start': 142, '_split_overlap': []}) >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}) >] """ # noqa: E501 From 85f2ea2b669b623a9c02aa31b709f1884ce78096 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 14:06:31 +0100 Subject: [PATCH 31/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 19b420c059..9986654058 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -142,7 +142,7 @@ def _chunk_text(self, text: str) -> List[str]: escaped_separator = re.escape(curr_separator) splits = re.split(escaped_separator, text) - if len(splits) == 1: # go to next separator, if current separator not found + if len(splits) == 1: # go to next separator, if current separator not found in the text continue chunks = [] From 644056f5ee8006577d67196d217b171a7fc96763 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 14:06:40 +0100 Subject: [PATCH 32/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 9986654058..b707fdf714 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -47,7 +47,7 @@ class RecursiveDocumentSplitter: >Document(id=..., content: 'Artificial intelligence (AI) - Introduction\n\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}) >Document(id=..., content: 'AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 1, 'split_idx_start': 45, '_split_overlap': []}) >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 2, 'split_idx_start': 142, '_split_overlap': []}) - >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951'}) + >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 3, 'split_idx_start': 216, '_split_overlap': []}) >] """ # noqa: E501 From 3cb85d918964dd5d839fe786416083c27ca0170d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 15:23:55 +0100 Subject: [PATCH 33/82] wip --- .../preprocessors/recursive_splitter.py | 31 ++++++----- .../preprocessors/test_recursive_splitter.py | 54 ++++++------------- 2 files changed, 33 insertions(+), 52 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index b707fdf714..e573f616a2 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -34,7 +34,7 @@ class RecursiveDocumentSplitter: from haystack import Document from haystack.components.preprocessors import RecursiveDocumentSplitter - chunker = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True) + chunker = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "]) text = '''Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -52,11 +52,7 @@ class RecursiveDocumentSplitter: """ # noqa: E501 def __init__( # pylint: disable=too-many-positional-arguments - self, - split_length: int = 200, - split_overlap: int = 0, - separators: Optional[List[str]] = None, - keep_separator: bool = True, + self, split_length: int = 200, split_overlap: int = 0, separators: Optional[List[str]] = None ): """ Initializes a RecursiveDocumentSplitter. 
@@ -67,7 +63,6 @@ def __init__( # pylint: disable=too-many-positional-arguments separators will be treated as regular expressions un less if the separator is "sentence", in that case the text will be split into sentences using a custom sentence tokenizer based on NLTK. If no separators are provided, the default separators ["\n\n", "\n", ".", " "] are used. - :param keep_separator: Whether to keep the separator character in the resulting chunks. :raises ValueError: If the overlap is greater than or equal to the chunk size or if the overlap is negative, or if any separator is not a string. @@ -75,7 +70,6 @@ def __init__( # pylint: disable=too-many-positional-arguments self.split_length = split_length self.split_overlap = split_overlap self.separators = separators if separators else ["\n\n", "\n", ".", " "] - self.keep_separator = keep_separator self._check_params() if "sentence" in self.separators: self.nltk_tokenizer = self._get_custom_sentence_tokenizer() @@ -137,11 +131,26 @@ def _chunk_text(self, text: str) -> List[str]: # using the custom NLTK-based sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] + + print("") + for idx, split in enumerate(splits): + print(f"idx: {idx}, split: {split}") + else: - # apply current separator regex to split text + # using the separator as a regex escaped_separator = re.escape(curr_separator) + escaped_separator = ( + f"({escaped_separator})" # wrap the separator in a group to include it in the splits + ) splits = re.split(escaped_separator, text) + # add the separator to the end of the previous split + splits = [splits[i] + splits[i + 1] for i in range(0, len(splits) - 1, 2)] + + # print("") + # for idx, split in enumerate(splits): + # print(f"idx: {idx}, split: {split}") + if len(splits) == 1: # go to next separator, if current separator not found in the text continue @@ -153,10 +162,6 @@ def _chunk_text(self, text: str) -> List[str]: for idx, split in enumerate(splits): split_text = split - # add separator to the split, if it's not the last split - if self.keep_separator and curr_separator != "sentence" and idx < len(splits) - 1: - split_text = split + curr_separator - # if adding this split exceeds chunk_size, process current_chunk if current_length + len(split_text) > self.split_length: if current_chunk: # keep the good splits diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 3d797c0134..7314792d4f 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -62,8 +62,8 @@ def test_chunk_text_smaller_than_chunk_size(): assert chunks[0] == text -def test_chunk_text_keep_separator(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=True) +def test_chunk_split_by_period(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) text = "This is a test. Another sentence. And one more." chunks = splitter._chunk_text(text) assert len(chunks) == 3 @@ -72,34 +72,13 @@ def test_chunk_text_keep_separator(): assert chunks[2] == " And one more." -def test_chunk_text_do_not_keep_seperator(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=False) - text = "This is a test. Another sentence. And one more." 
- chunks = splitter._chunk_text(text) - assert len(chunks) == 3 - assert chunks[0] == "This is a test" - assert chunks[1] == " Another sentence" - assert chunks[2] == " And one more" - - -def test_keep_separator_chunks_are_equal(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], keep_separator=True) - text = "This is a test.This is a test" - chunks = splitter._chunk_text(text) - assert len(chunks) == 2 - assert chunks[0] == "This is a test." - assert chunks[1] == "This is a test" - - def test_chunk_text_using_nltk_sentence(): """ This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and requires a more sophisticated sentence tokenizer like the one provided by NLTK. """ - splitter = RecursiveDocumentSplitter( - split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "], keep_separator=True - ) + splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -128,9 +107,7 @@ def test_recursive_splitter_empty_documents(): def test_recursive_chunker_with_multiple_separators_recursive(): - splitter = RecursiveDocumentSplitter( - split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "], keep_separator=True - ) + splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "]) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -161,7 +138,7 @@ def test_recursive_chunker_with_multiple_separators_recursive(): def test_recursive_chunker_split_document_with_overlap(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "], keep_separator=True) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "]) text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" doc = Document(content=text) @@ -205,7 +182,7 @@ def test_recursive_chunker_split_document_with_overlap(): def test_recursive_splitter_generate_pages(): - splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=[" "], keep_separator=True) + splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=[" "]) doc = Document(content="This is some text. \f This text is on another page. \f This is the last page.") doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] @@ -229,7 +206,7 @@ def test_recursive_splitter_separator_exists_but_split_length_too_small_fall_bac def test_recursive_splitter_generate_empty_chunks(): - splitter = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"], keep_separator=False) + splitter = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"]) text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) chunks = splitter.run([doc])["documents"] @@ -239,15 +216,14 @@ def test_recursive_splitter_generate_empty_chunks(): assert chunks[2].content == "Final test." 
-# def test_recursive_splitter_fallback_to_character_chunking(): -# text = "abczdefzghizjkl" -# separators = ["\n\n", "\n", "z"] -# splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, keep_separator=False) -# -# doc = Document(content=text) -# chunks = splitter.run([doc])["documents"] -# for chunk in chunks: -# print(chunk.content) +def test_recursive_splitter_fallback_to_character_chunking(): + text = "abczdefzghizjkl" + separators = ["\n\n", "\n", "z"] + splitter = RecursiveDocumentSplitter(split_length=2, separators=separators) + doc = Document(content=text) + chunks = splitter.run([doc])["documents"] + for chunk in chunks: + assert len(chunk.content) <= 2 def test_recursive_splitter_serialization_in_pipeline(): From 42faf0559fcd2116f6acb5396750c4e66ea65e6e Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 15:53:05 +0100 Subject: [PATCH 34/82] wip --- .../preprocessors/recursive_splitter.py | 3 +- .../preprocessors/test_recursive_splitter.py | 35 ++++++------------- 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index e573f616a2..e2e61bd571 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -69,7 +69,7 @@ def __init__( # pylint: disable=too-many-positional-arguments """ self.split_length = split_length self.split_overlap = split_overlap - self.separators = separators if separators else ["\n\n", "\n", ".", " "] + self.separators = separators if separators else ["\n\n", "sentence", "\n", " "] # default separators self._check_params() if "sentence" in self.separators: self.nltk_tokenizer = self._get_custom_sentence_tokenizer() @@ -203,7 +203,6 @@ def _run_one(self, doc: Document) -> List[Document]: for split_nr, chunk in enumerate(chunks): new_doc = Document(content=chunk, meta=deepcopy(doc.meta)) - new_doc.meta["original_id"] = doc.id new_doc.meta["split_id"] = split_nr new_doc.meta["split_idx_start"] = current_position new_doc.meta["_split_overlap"] = [] if self.split_overlap > 0 else None diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 7314792d4f..5acf6273d6 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -72,13 +72,21 @@ def test_chunk_split_by_period(): assert chunks[2] == " And one more." +def test_recursive_splitter_empty_documents(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) + empty_doc = Document(content="") + doc_chunks = splitter.run([empty_doc]) + doc_chunks = doc_chunks["documents"] + assert len(doc_chunks) == 0 + + def test_chunk_text_using_nltk_sentence(): """ This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and requires a more sophisticated sentence tokenizer like the one provided by NLTK. """ - splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) + splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "sentence", "\n", " "]) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. 
@@ -98,16 +106,8 @@ def test_chunk_text_using_nltk_sentence(): ) # noqa: E501 -def test_recursive_splitter_empty_documents(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) - empty_doc = Document(content="") - doc_chunks = splitter.run([empty_doc]) - doc_chunks = doc_chunks["documents"] - assert len(doc_chunks) == 0 - - def test_recursive_chunker_with_multiple_separators_recursive(): - splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", ".", " "]) + splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -118,13 +118,6 @@ def test_recursive_chunker_with_multiple_separators_recursive(): doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 4 - assert ( - doc_chunks[0].meta["original_id"] - == doc_chunks[1].meta["original_id"] - == doc_chunks[2].meta["original_id"] - == doc_chunks[3].meta["original_id"] - == doc.id - ) assert doc_chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" assert ( doc_chunks[1].content @@ -146,13 +139,6 @@ def test_recursive_chunker_split_document_with_overlap(): doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 4 - assert ( - doc_chunks[0].meta["original_id"] - == doc_chunks[1].meta["original_id"] - == doc_chunks[2].meta["original_id"] - == doc_chunks[3].meta["original_id"] - == doc.id - ) assert doc_chunks[0].content == "A simple sentence1." assert doc_chunks[0].meta["split_id"] == 0 @@ -210,7 +196,6 @@ def test_recursive_splitter_generate_empty_chunks(): text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) chunks = splitter.run([doc])["documents"] - assert chunks[0].content == "This is a test." assert chunks[1].content == "\nAnother test." assert chunks[2].content == "Final test." From 7d9c4df05e4342d464cbacc057c43a7623bc404e Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 17:08:22 +0100 Subject: [PATCH 35/82] updating tests --- .../preprocessors/recursive_splitter.py | 17 +-- .../preprocessors/test_recursive_splitter.py | 122 +++++++++--------- 2 files changed, 68 insertions(+), 71 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index e2e61bd571..dbb266e7de 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -131,11 +131,6 @@ def _chunk_text(self, text: str) -> List[str]: # using the custom NLTK-based sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] - - print("") - for idx, split in enumerate(splits): - print(f"idx: {idx}, split: {split}") - else: # using the separator as a regex escaped_separator = re.escape(curr_separator) @@ -144,12 +139,14 @@ def _chunk_text(self, text: str) -> List[str]: ) splits = re.split(escaped_separator, text) - # add the separator to the end of the previous split - splits = [splits[i] + splits[i + 1] for i in range(0, len(splits) - 1, 2)] + # merge every two consecutive splits (i.e., the ext and the separator after it) + splits = [ + "".join([splits[i], splits[i + 1]]) if i < len(splits) - 1 else splits[i] + for i in range(0, len(splits), 2) + ] - # print("") - # for idx, split in enumerate(splits): - # print(f"idx: {idx}, split: {split}") + # remove last split if it is empty + splits = splits[:-1] if splits[-1] == "" else splits if len(splits) == 1: # go to next separator, if current separator not found in the text continue diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 5acf6273d6..82b17c6749 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -62,7 +62,7 @@ def test_chunk_text_smaller_than_chunk_size(): assert chunks[0] == text -def test_chunk_split_by_period(): +def test_chunk_text_by_period(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) text = "This is a test. Another sentence. And one more." chunks = splitter._chunk_text(text) @@ -80,54 +80,64 @@ def test_recursive_splitter_empty_documents(): assert len(doc_chunks) == 0 -def test_chunk_text_using_nltk_sentence(): - """ - This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and - requires a more sophisticated sentence tokenizer like the one provided by NLTK. - """ - - splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "sentence", "\n", " "]) - text = """Artificial intelligence (AI) - Introduction - -AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -AI technology is widely used throughout industry, government, and science. 
Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 - - chunks = splitter._chunk_text(text) - assert len(chunks) == 4 - assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" - assert ( - chunks[1] - == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" - ) # noqa: E501 - assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501 - assert ( - chunks[3] - == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." - ) # noqa: E501 - - -def test_recursive_chunker_with_multiple_separators_recursive(): - splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) - text = """Artificial intelligence (AI) - Introduction - -AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 - +def test_recursive_splitter_generate_empty_chunks(): + splitter = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"]) + text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) - doc_chunks = splitter.run([doc]) - doc_chunks = doc_chunks["documents"] + chunks = splitter.run([doc])["documents"] + assert chunks[0].content == "This is a test." + assert chunks[1].content == "\nAnother test." + assert chunks[2].content == "Final test." - assert len(doc_chunks) == 4 - assert doc_chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" - assert ( - doc_chunks[1].content - == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" - ) - assert doc_chunks[2].content == "AI technology is widely used throughout industry, government, and science." - assert ( - doc_chunks[3].content - == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games." - ) + +# def test_chunk_text_using_nltk_sentence(): +# """ +# This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and +# requires a more sophisticated sentence tokenizer like the one provided by NLTK. 
+# """ +# +# splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", ".", " "]) +# text = """Artificial intelligence (AI) - Introduction +# +# AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. +# AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 +# +# chunks = splitter._chunk_text(text) +# assert len(chunks) == 4 +# assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" +# assert ( +# chunks[1] +# == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" +# ) # noqa: E501 +# assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501 +# assert ( +# chunks[3] +# == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." +# ) # noqa: E501 + + +# def test_recursive_chunker_with_multiple_separators_recursive(): +# splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) +# text = """Artificial intelligence (AI) - Introduction +# +# AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. +# AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 +# +# doc = Document(content=text) +# doc_chunks = splitter.run([doc]) +# doc_chunks = doc_chunks["documents"] +# +# assert len(doc_chunks) == 4 +# assert doc_chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" +# assert ( +# doc_chunks[1].content +# == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" +# ) +# assert doc_chunks[2].content == "AI technology is widely used throughout industry, government, and science." +# assert ( +# doc_chunks[3].content +# == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games." +# ) def test_recursive_chunker_split_document_with_overlap(): @@ -142,12 +152,12 @@ def test_recursive_chunker_split_document_with_overlap(): assert doc_chunks[0].content == "A simple sentence1." 
assert doc_chunks[0].meta["split_id"] == 0 - assert doc_chunks[0].meta["split_idx_start"] == 0 + assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 11)}] assert doc_chunks[1].content == " sentence1. A bright sentence2." assert doc_chunks[1].meta["split_id"] == 1 - assert doc_chunks[1].meta["split_idx_start"] == 8 + assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) assert doc_chunks[1].meta["_split_overlap"] == [ {"doc_id": doc_chunks[0].id, "range": (8, 19)}, {"doc_id": doc_chunks[2].id, "range": (0, 11)}, @@ -155,7 +165,7 @@ def test_recursive_chunker_split_document_with_overlap(): assert doc_chunks[2].content == " sentence2. A clever sentence3." assert doc_chunks[2].meta["split_id"] == 2 - assert doc_chunks[2].meta["split_idx_start"] == 28 + assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) assert doc_chunks[2].meta["_split_overlap"] == [ {"doc_id": doc_chunks[1].id, "range": (20, 31)}, {"doc_id": doc_chunks[3].id, "range": (0, 11)}, @@ -163,7 +173,7 @@ def test_recursive_chunker_split_document_with_overlap(): assert doc_chunks[3].content == " sentence3. A joyful sentence4" assert doc_chunks[3].meta["split_id"] == 3 - assert doc_chunks[3].meta["split_idx_start"] == 48 + assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (20, 31)}] @@ -191,16 +201,6 @@ def test_recursive_splitter_separator_exists_but_split_length_too_small_fall_bac assert len(doc.content) == 2 -def test_recursive_splitter_generate_empty_chunks(): - splitter = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"]) - text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." - doc = Document(content=text) - chunks = splitter.run([doc])["documents"] - assert chunks[0].content == "This is a test." - assert chunks[1].content == "\nAnother test." - assert chunks[2].content == "Final test." - - def test_recursive_splitter_fallback_to_character_chunking(): text = "abczdefzghizjkl" separators = ["\n\n", "\n", "z"] From 5bcf709f46055916caa27ff7e5ad36c1b5688186 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 18:04:03 +0100 Subject: [PATCH 36/82] wip: fixing all tests after changes --- .../preprocessors/recursive_splitter.py | 4 +- .../preprocessors/test_recursive_splitter.py | 105 ++++++++---------- 2 files changed, 45 insertions(+), 64 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index dbb266e7de..8965f1a5ae 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -128,18 +128,16 @@ def _chunk_text(self, text: str) -> List[str]: for curr_separator in self.separators: # type: ignore # the caller already checked that separators is not None if curr_separator == "sentence": - # using the custom NLTK-based sentence tokenizer sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] else: - # using the separator as a regex escaped_separator = re.escape(curr_separator) escaped_separator = ( f"({escaped_separator})" # wrap the separator in a group to include it in the splits ) splits = re.split(escaped_separator, text) - # merge every two consecutive splits (i.e., the ext and the separator after it) + # merge every two consecutive splits, i.e.: the text and the separator after it splits = [ "".join([splits[i], splits[i + 1]]) if i < len(splits) - 1 else splits[i] for i in range(0, len(splits), 2) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 82b17c6749..4d118032b5 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -1,4 +1,5 @@ import pytest +from pytest import LogCaptureFixture from haystack import Document, Pipeline from haystack.components.preprocessors.recursive_splitter import RecursiveDocumentSplitter @@ -72,72 +73,54 @@ def test_chunk_text_by_period(): assert chunks[2] == " And one more." -def test_recursive_splitter_empty_documents(): +def test_recursive_splitter_multiple_new_lines(): + splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"]) + text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." + doc = Document(content=text) + chunks = splitter.run([doc])["documents"] + assert chunks[0].content == "This is a test.\n\n" + assert chunks[1].content == "\nAnother test.\n\n" + assert chunks[2].content == "\n\nFinal test." + + +def test_recursive_splitter_empty_documents(caplog: LogCaptureFixture): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) empty_doc = Document(content="") doc_chunks = splitter.run([empty_doc]) doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 0 - - -def test_recursive_splitter_generate_empty_chunks(): - splitter = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"]) - text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." - doc = Document(content=text) - chunks = splitter.run([doc])["documents"] - assert chunks[0].content == "This is a test." - assert chunks[1].content == "\nAnother test." - assert chunks[2].content == "Final test." - - -# def test_chunk_text_using_nltk_sentence(): -# """ -# This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and -# requires a more sophisticated sentence tokenizer like the one provided by NLTK. 
-# """ -# -# splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", ".", " "]) -# text = """Artificial intelligence (AI) - Introduction -# -# AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -# AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 -# -# chunks = splitter._chunk_text(text) -# assert len(chunks) == 4 -# assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n" -# assert ( -# chunks[1] -# == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" -# ) # noqa: E501 -# assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501 -# assert ( -# chunks[3] -# == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." -# ) # noqa: E501 - - -# def test_recursive_chunker_with_multiple_separators_recursive(): -# splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) -# text = """Artificial intelligence (AI) - Introduction -# -# AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. -# AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501 -# -# doc = Document(content=text) -# doc_chunks = splitter.run([doc]) -# doc_chunks = doc_chunks["documents"] -# -# assert len(doc_chunks) == 4 -# assert doc_chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" -# assert ( -# doc_chunks[1].content -# == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" -# ) -# assert doc_chunks[2].content == "AI technology is widely used throughout industry, government, and science." -# assert ( -# doc_chunks[3].content -# == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games." -# ) + assert "has an empty content. Skipping this document." in caplog.text + + +def test_recursive_splitter_using_custom_sentence_tokenizer(): + """ + This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and requires a + more sophisticated sentence tokenizer like the one provided by NLTK. 
+ """ + splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) + text = """Artificial intelligence (AI) - Introduction + +AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. +AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501 + + chunks = splitter.run([Document(content=text)]) + chunks = chunks["documents"] + assert len(chunks) == 4 + assert chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" + assert ( + chunks[1].content + == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n" + ) # noqa: E501 + assert chunks[2].content == "AI technology is widely used throughout industry, government, and science." # noqa: E501 + assert ( + chunks[3].content + == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)." + ) # noqa: E501 + + +def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): + # ToDo: + pass def test_recursive_chunker_split_document_with_overlap(): From 9205ef2adc3387e62f0306315591a83ad5196b60 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 18:18:13 +0100 Subject: [PATCH 37/82] more tests --- .../components/preprocessors/recursive_splitter.py | 2 +- .../preprocessors/test_recursive_splitter.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 8965f1a5ae..0d386afda8 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -154,7 +154,7 @@ def _chunk_text(self, text: str) -> List[str]: current_length = 0 # check splits, if any is too long, recursively chunk it, otherwise add to current chunk - for idx, split in enumerate(splits): + for split in splits: split_text = split # if adding this split exceeds chunk_size, process current_chunk diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 4d118032b5..3fcfedf150 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -39,7 +39,7 @@ def test_apply_overlap_no_overlap(): assert result == ["chunk1", "chunk2", "chunk3"] -def test_apply_overlap_with_overlap(): +def test_apply_overlap_with_overlap_case_1(): # Test the case where there is overlap between chunks splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."]) chunks = ["chunk1", "chunk2", "chunk3"] @@ -47,6 +47,14 @@ def test_apply_overlap_with_overlap(): assert result == ["chunk1", "unk1chunk2", "unk2chunk3"] +def test_apply_overlap_with_overlap_case_2(): + # Test the case where there is overlap between chunks + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."]) + chunks = ["chunk1", "chunk2", "chunk3", "chunk4"] + result = splitter._apply_overlap(chunks) + assert result == ["chunk1", "chunk1chunk2", "chunk2chunk3", "chunk3chunk4"] + + def test_apply_overlap_single_chunk(): # Test the case where there is only one chunk splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."]) From 437570f69670ca64996d48244b6e2dc517fc5bb2 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 12 Dec 2024 18:38:45 +0100 Subject: [PATCH 38/82] wip: debugging sentence overlap --- .../components/preprocessors/recursive_splitter.py | 11 +++++++++++ .../preprocessors/test_recursive_splitter.py | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 0d386afda8..ac350c5bb6 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -130,6 +130,12 @@ def _chunk_text(self, text: str) -> List[str]: if curr_separator == "sentence": sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] + + print("\n") + + for i, sentence in enumerate(splits): + print(f"Sentence {i}: {sentence}") + else: escaped_separator = re.escape(curr_separator) escaped_separator = ( @@ -153,8 +159,13 @@ def _chunk_text(self, text: str) -> List[str]: current_chunk: List[str] = [] current_length = 0 + print("\n\n") + # check splits, if any is too long, recursively chunk it, otherwise add to current chunk for split in splits: + print("in loop") + print(f"Split: {split}") + split_text = split # if adding this split exceeds chunk_size, process current_chunk diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 3fcfedf150..07dc567c87 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -127,8 +127,10 @@ def test_recursive_splitter_using_custom_sentence_tokenizer(): def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): - # ToDo: - pass + splitter = RecursiveDocumentSplitter(split_length=40, split_overlap=10, separators=["sentence"]) + text = "I must not fear. Fear is the mind-killer. Fear is the little-death that brings total obliteration. I will face my fear. I will permit it to pass over me and through me. And when it has gone past I will turn the inner eye to see its path. Where the fear has gone there will be nothing. Only I will remain." + chunks = splitter.run([Document(content=text)]) + # ToDo def test_recursive_chunker_split_document_with_overlap(): From 97437d82719893dfa79613bb15612e431f41ff88 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Fri, 13 Dec 2024 17:56:33 +0100 Subject: [PATCH 39/82] wip: debugging page number --- .../preprocessors/recursive_splitter.py | 28 +-- .../preprocessors/test_recursive_splitter.py | 197 +++++++++++++++--- 2 files changed, 187 insertions(+), 38 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index ac350c5bb6..404a81e5c8 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -130,12 +130,6 @@ def _chunk_text(self, text: str) -> List[str]: if curr_separator == "sentence": sentence_with_spans = self.nltk_tokenizer.split_sentences(text) splits = [sentence["sentence"] for sentence in sentence_with_spans] - - print("\n") - - for i, sentence in enumerate(splits): - print(f"Sentence {i}: {sentence}") - else: escaped_separator = re.escape(curr_separator) escaped_separator = ( @@ -159,15 +153,9 @@ def _chunk_text(self, text: str) -> List[str]: current_chunk: List[str] = [] current_length = 0 - print("\n\n") - # check splits, if any is too long, recursively chunk it, otherwise add to current chunk for split in splits: - print("in loop") - print(f"Split: {split}") - split_text = split - # if adding this split exceeds chunk_size, process current_chunk if current_length + len(split_text) > self.split_length: if current_chunk: # keep the good splits @@ -212,7 +200,6 @@ def _run_one(self, doc: Document) -> List[Document]: new_doc.meta["split_id"] = split_nr new_doc.meta["split_idx_start"] = current_position new_doc.meta["_split_overlap"] = [] if self.split_overlap > 0 else None - new_doc.meta["page_number"] = current_page if split_nr > 0 and self.split_overlap > 0: previous_doc = new_docs[-1] @@ -226,8 +213,21 @@ def _run_one(self, doc: Document) -> List[Document]: } ) + # for the case where a chunk ends with one or multiple consecutive page breaks + # page_breaks_at_end = 0 + # for i in range(1, len(chunk) + 1): + # if ord(chunk[-i]) == 12: # ASCII value for form feed, which is used as a page break + # page_breaks_at_end += 1 + # if page_breaks_at_end > 0 and current_page > 1: + # new_doc.meta["page_number"] = current_page - page_breaks_at_end + # else: + # new_doc.meta["page_number"] = current_page + + new_doc.meta["page_number"] = current_page + current_page += chunk.count("\f") # count page breaks in the chunk + + # keep the new chunk doc and update the current position new_docs.append(new_doc) - current_page += chunk.count("\f") # update the page number based on the number of page breaks current_position += len(chunk) - (self.split_overlap if split_nr < len(chunks) - 1 else 0) return new_docs diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 07dc567c87..a37eee83a2 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -47,8 +47,8 @@ def test_apply_overlap_with_overlap_case_1(): assert result == ["chunk1", "unk1chunk2", "unk2chunk3"] -def test_apply_overlap_with_overlap_case_2(): - # Test the case where there is overlap between chunks +# ToDo: update this test, result above is not the expected one +def ignore_test_apply_overlap_with_overlap_case_2(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."]) chunks = ["chunk1", "chunk2", "chunk3", "chunk4"] result = splitter._apply_overlap(chunks) @@ -126,14 +126,178 @@ def 
test_recursive_splitter_using_custom_sentence_tokenizer(): ) # noqa: E501 +def test_run_split_by_dot_count_page_breaks() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=30, split_overlap=0) + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 7 + assert documents[0].content == "Sentence on page 1." + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + + assert documents[1].content == " Another on page 1." + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + + assert documents[2].content == "\fSentence on page 2." + assert documents[2].meta["page_number"] == 2 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + + assert documents[3].content == " Another on page 2." + assert documents[3].meta["page_number"] == 2 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + + assert documents[4].content == "\fSentence on page 3." + assert documents[4].meta["page_number"] == 3 + assert documents[4].meta["split_id"] == 4 + assert documents[4].meta["split_idx_start"] == text.index(documents[4].content) + + assert documents[5].content == " Another on page 3." + assert documents[5].meta["page_number"] == 3 + assert documents[5].meta["split_id"] == 5 + assert documents[5].meta["split_idx_start"] == text.index(documents[5].content) + + assert documents[6].content == "\f\f Sentence on page 5." + assert documents[6].meta["page_number"] == 5 + assert documents[6].meta["split_id"] == 6 + assert documents[6].meta["split_idx_start"] == text.index(documents[6].content) + + +def test_run_split_by_word_count_page_breaks(): + splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["w"]) + text = "This is some text. \f This text is on another page. \f This is the last pag3." + doc = Document(content=text) + doc_chunks = splitter.run([doc]) + doc_chunks = doc_chunks["documents"] + + assert len(doc_chunks) == 5 + assert doc_chunks[0].content == "This is some text." + assert doc_chunks[0].meta["page_number"] == 1 + assert doc_chunks[0].meta["split_id"] == 0 + assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) + + assert doc_chunks[1].content == " \f This text is on" + assert doc_chunks[1].meta["page_number"] == 2 + assert doc_chunks[1].meta["split_id"] == 1 + assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) + + assert doc_chunks[2].content == " another page. \f T" + assert doc_chunks[2].meta["page_number"] == 3 + assert doc_chunks[2].meta["split_id"] == 2 + assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) + + assert doc_chunks[3].content == "his is the last pa" + assert doc_chunks[3].meta["page_number"] == 3 + assert doc_chunks[3].meta["split_id"] == 3 + assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) + + assert doc_chunks[4].content == "g3." 
+ assert doc_chunks[4].meta["page_number"] == 3 + assert doc_chunks[4].meta["split_id"] == 4 + assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content) + + +# ToDo: seems the 'sentence' separator eliminates the page breaks - investigate this further +def ignore_test_run_split_by_sentence_count_page_breaks() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=20, split_overlap=0) + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)]) + chunks_docs = documents["documents"] + assert len(chunks_docs) == 7 + assert chunks_docs[0].content == "Sentence on page 1." + assert chunks_docs[0].meta["page_number"] == 1 + assert chunks_docs[0].meta["split_id"] == 0 + assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) + + assert chunks_docs[1].content == " Another on page 1." + assert chunks_docs[1].meta["page_number"] == 1 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) + + assert chunks_docs[2].content == "\fSentence on page 2." + assert chunks_docs[2].meta["page_number"] == 2 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == " Another on page 2." + assert chunks_docs[3].meta["page_number"] == 2 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + assert chunks_docs[4].content == "\fSentence on page 3." + assert chunks_docs[4].meta["page_number"] == 3 + assert chunks_docs[4].meta["split_id"] == 4 + assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) + + assert chunks_docs[5].content == " Another on page 3." + assert chunks_docs[5].meta["page_number"] == 3 + assert chunks_docs[5].meta["split_id"] == 5 + assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + + assert chunks_docs[6].content == "\f\f Sentence on page 5." + assert chunks_docs[6].meta["page_number"] == 5 + assert chunks_docs[6].meta["split_id"] == 6 + assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) + + +def test_run_split_by_page_break_count_page_breaks() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=50, split_overlap=0) + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)]) + chunks_docs = documents["documents"] + assert len(chunks_docs) == 4 + assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f" + assert chunks_docs[0].meta["page_number"] == 1 + assert chunks_docs[0].meta["split_id"] == 0 + assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) + + assert chunks_docs[1].content == "Sentence on page 2. Another on page 2.\f" + assert chunks_docs[1].meta["page_number"] == 2 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) + + assert chunks_docs[2].content == "Sentence on page 3. 
Another on page 3.\f\f" + assert chunks_docs[2].meta["page_number"] == 3 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == " Sentence on page 5." + assert chunks_docs[3].meta["page_number"] == 5 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + +def test_run_split_by_new_line_count_page_breaks() -> None: + # ToDo + pass + + def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): - splitter = RecursiveDocumentSplitter(split_length=40, split_overlap=10, separators=["sentence"]) - text = "I must not fear. Fear is the mind-killer. Fear is the little-death that brings total obliteration. I will face my fear. I will permit it to pass over me and through me. And when it has gone past I will turn the inner eye to see its path. Where the fear has gone there will be nothing. Only I will remain." - chunks = splitter.run([Document(content=text)]) # ToDo + pass -def test_recursive_chunker_split_document_with_overlap(): +def test_run_split_document_with_overlap(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "]) text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" @@ -170,22 +334,7 @@ def test_recursive_chunker_split_document_with_overlap(): assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (20, 31)}] -def test_recursive_splitter_generate_pages(): - splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=[" "]) - doc = Document(content="This is some text. \f This text is on another page. \f This is the last page.") - doc_chunks = splitter.run([doc]) - doc_chunks = doc_chunks["documents"] - assert len(doc_chunks) == 7 - for doc in doc_chunks: - if doc.meta["split_id"] in [0, 1, 2]: - assert doc.meta["page_number"] == 1 - if doc.meta["split_id"] in [3, 4]: - assert doc.meta["page_number"] == 2 - if doc.meta["split_id"] in [5, 6]: - assert doc.meta["page_number"] == 3 - - -def test_recursive_splitter_separator_exists_but_split_length_too_small_fall_back_to_character_chunking(): +def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_chunking(): splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2) doc = Document(content="This is some text. This is some more text.") result = splitter.run(documents=[doc]) @@ -194,7 +343,7 @@ def test_recursive_splitter_separator_exists_but_split_length_too_small_fall_bac assert len(doc.content) == 2 -def test_recursive_splitter_fallback_to_character_chunking(): +def test_run_fallback_to_character_chunking(): text = "abczdefzghizjkl" separators = ["\n\n", "\n", "z"] splitter = RecursiveDocumentSplitter(split_length=2, separators=separators) @@ -204,7 +353,7 @@ def test_recursive_splitter_fallback_to_character_chunking(): assert len(chunk.content) <= 2 -def test_recursive_splitter_serialization_in_pipeline(): +def test_run_serialization_in_pipeline(): pipeline = Pipeline() pipeline.add_component("chunker", RecursiveDocumentSplitter(split_length=20, split_overlap=5, separators=["."])) pipeline_dict = pipeline.dumps() From 13f85e15c33966a0ae6e7a96bacd1134e467728e Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 16 Dec 2024 10:48:19 +0100 Subject: [PATCH 40/82] wip --- .../preprocessors/recursive_splitter.py | 21 ++- .../preprocessors/test_recursive_splitter.py | 128 ++++++++++++++---- 2 files changed, 109 insertions(+), 40 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 404a81e5c8..78c7133a2b 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -213,18 +213,15 @@ def _run_one(self, doc: Document) -> List[Document]: } ) - # for the case where a chunk ends with one or multiple consecutive page breaks - # page_breaks_at_end = 0 - # for i in range(1, len(chunk) + 1): - # if ord(chunk[-i]) == 12: # ASCII value for form feed, which is used as a page break - # page_breaks_at_end += 1 - # if page_breaks_at_end > 0 and current_page > 1: - # new_doc.meta["page_number"] = current_page - page_breaks_at_end - # else: - # new_doc.meta["page_number"] = current_page - - new_doc.meta["page_number"] = current_page - current_page += chunk.count("\f") # count page breaks in the chunk + # count page breaks in the chunk + current_page += chunk.count("\f") + # count the number of consecutive page breaks at the end of the chunk + consecutive_page_breaks = len(chunk) - len(chunk.rstrip("\f")) + + if consecutive_page_breaks > 0: + new_doc.meta["page_number"] = current_page - consecutive_page_breaks + else: + new_doc.meta["page_number"] = current_page # keep the new chunk doc and update the current position new_docs.append(new_doc) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index a37eee83a2..87f4124353 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -207,56 +207,91 @@ def test_run_split_by_word_count_page_breaks(): assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content) -# ToDo: seems the 'sentence' separator eliminates the page breaks - investigate this further -def ignore_test_run_split_by_sentence_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=20, split_overlap=0) +def test_run_split_by_page_break_count_page_breaks() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=50, split_overlap=0) text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." ) + documents = document_splitter.run(documents=[Document(content=text)]) + chunks_docs = documents["documents"] + assert len(chunks_docs) == 4 + assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f" + assert chunks_docs[0].meta["page_number"] == 1 + assert chunks_docs[0].meta["split_id"] == 0 + assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) + + assert chunks_docs[1].content == "Sentence on page 2. Another on page 2.\f" + assert chunks_docs[1].meta["page_number"] == 2 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) + + assert chunks_docs[2].content == "Sentence on page 3. 
Another on page 3.\f\f" + assert chunks_docs[2].meta["page_number"] == 3 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == " Sentence on page 5." + assert chunks_docs[3].meta["page_number"] == 5 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + +def test_run_split_by_new_line_count_page_breaks() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=50, split_overlap=0) + + text = ( + "Sentence on page 1.\nAnother on page 1.\f" + "Sentence on page 2.\nAnother on page 2.\f" + "Sentence on page 3.\nAnother on page 3.\f\f" + "Sentence on page 5." + ) + documents = document_splitter.run(documents=[Document(content=text)]) chunks_docs = documents["documents"] assert len(chunks_docs) == 7 - assert chunks_docs[0].content == "Sentence on page 1." + + assert chunks_docs[0].content == "Sentence on page 1.\n" assert chunks_docs[0].meta["page_number"] == 1 assert chunks_docs[0].meta["split_id"] == 0 assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) - assert chunks_docs[1].content == " Another on page 1." + assert chunks_docs[1].content == "Another on page 1.\f" assert chunks_docs[1].meta["page_number"] == 1 assert chunks_docs[1].meta["split_id"] == 1 assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) - assert chunks_docs[2].content == "\fSentence on page 2." + assert chunks_docs[2].content == "Sentence on page 2.\n" assert chunks_docs[2].meta["page_number"] == 2 assert chunks_docs[2].meta["split_id"] == 2 assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) - assert chunks_docs[3].content == " Another on page 2." + assert chunks_docs[3].content == "Another on page 2.\f" assert chunks_docs[3].meta["page_number"] == 2 assert chunks_docs[3].meta["split_id"] == 3 assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) - assert chunks_docs[4].content == "\fSentence on page 3." + assert chunks_docs[4].content == "Sentence on page 3.\n" assert chunks_docs[4].meta["page_number"] == 3 assert chunks_docs[4].meta["split_id"] == 4 assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) - assert chunks_docs[5].content == " Another on page 3." + assert chunks_docs[5].content == "Another on page 3.\f\f" assert chunks_docs[5].meta["page_number"] == 3 assert chunks_docs[5].meta["split_id"] == 5 assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) - assert chunks_docs[6].content == "\f\f Sentence on page 5." + assert chunks_docs[6].content == "Sentence on page 5." assert chunks_docs[6].meta["page_number"] == 5 assert chunks_docs[6].meta["split_id"] == 6 assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) -def test_run_split_by_page_break_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=50, split_overlap=0) +# ToDo: seems the 'sentence' separator eliminates the page breaks - investigate this further +def ignore_test_run_split_by_sentence_count_page_breaks() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=50, split_overlap=0) text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. 
Another on page 2.\f" @@ -265,36 +300,73 @@ def test_run_split_by_page_break_count_page_breaks() -> None: documents = document_splitter.run(documents=[Document(content=text)]) chunks_docs = documents["documents"] - assert len(chunks_docs) == 4 - assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f" + assert len(chunks_docs) == 5 + assert chunks_docs[0].content == "Sentence on page 1. Another on page 1." assert chunks_docs[0].meta["page_number"] == 1 assert chunks_docs[0].meta["split_id"] == 0 assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) - assert chunks_docs[1].content == "Sentence on page 2. Another on page 2.\f" - assert chunks_docs[1].meta["page_number"] == 2 - assert chunks_docs[1].meta["split_id"] == 1 - assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) - - assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f\f" - assert chunks_docs[2].meta["page_number"] == 3 + assert chunks_docs[2].content == "\fSentence on page 2." + assert chunks_docs[2].meta["page_number"] == 2 assert chunks_docs[2].meta["split_id"] == 2 assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) - assert chunks_docs[3].content == " Sentence on page 5." - assert chunks_docs[3].meta["page_number"] == 5 + assert chunks_docs[3].content == " Another on page 2." + assert chunks_docs[3].meta["page_number"] == 2 assert chunks_docs[3].meta["split_id"] == 3 assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + assert chunks_docs[4].content == "\fSentence on page 3." + assert chunks_docs[4].meta["page_number"] == 3 + assert chunks_docs[4].meta["split_id"] == 4 + assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) -def test_run_split_by_new_line_count_page_breaks() -> None: - # ToDo - pass + assert chunks_docs[5].content == " Another on page 3." + assert chunks_docs[5].meta["page_number"] == 3 + assert chunks_docs[5].meta["split_id"] == 5 + assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + + assert chunks_docs[6].content == "\f\f Sentence on page 5." + assert chunks_docs[6].meta["page_number"] == 5 + assert chunks_docs[6].meta["split_id"] == 6 + assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): - # ToDo - pass + """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap""" + splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"]) + text = "This is sentence one. This is sentence two. This is sentence three." + + # ToDo: + # BUG: the sentence tokenizer is not working correctly, it eliminates the space between the sentences + + doc = Document(content=text) + doc_chunks = splitter.run([doc])["documents"] + + assert len(doc_chunks) == 3 + # for i, chunk in enumerate(doc_chunks): + # print(chunk.id) + # print(chunk.content) + # print(chunk.meta) + # print("\n---") + + # assert doc_chunks[0].content == "This is sentence one." + # assert doc_chunks[0].meta["split_id"] == 0 + # assert doc_chunks[0].meta["split_idx_start"] == 0 + # assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 5)}] + # + # assert doc_chunks[1].content == " one.This is sentence two." 
+ # assert doc_chunks[1].meta["split_id"] == 1 + # assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) + # assert doc_chunks[1].meta["_split_overlap"] == [ + # {"doc_id": doc_chunks[0].id, "range": (15, 20)}, + # {"doc_id": doc_chunks[2].id, "range": (0, 5)} + # ] + # + # assert doc_chunks[2].content == " two.This is sentence three." + # assert doc_chunks[2].meta["split_id"] == 2 + # assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) + # assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (16, 21)}] def test_run_split_document_with_overlap(): From eebe1a088a85e2b5ab6da5ebe4700bf4ef8fc9ae Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 11:39:05 +0100 Subject: [PATCH 41/82] wip; fixed bug with sentence tokenizer, needs to keep white spaces --- .../preprocessors/recursive_splitter.py | 2 +- .../preprocessors/test_recursive_splitter.py | 97 +++++++++---------- 2 files changed, 46 insertions(+), 53 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 78c7133a2b..eea94dfd8f 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -90,7 +90,7 @@ def _get_custom_sentence_tokenizer(): from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter except (LookupError, ModuleNotFoundError): raise Exception("You need to install NLTK to use this function. You can install it via `pip install nltk`") - return SentenceSplitter() + return SentenceSplitter(keep_white_spaces=True) def _apply_overlap(self, chunks: List[str]) -> List[str]: """ diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 87f4124353..8632954b34 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -289,8 +289,7 @@ def test_run_split_by_new_line_count_page_breaks() -> None: assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) -# ToDo: seems the 'sentence' separator eliminates the page breaks - investigate this further -def ignore_test_run_split_by_sentence_count_page_breaks() -> None: +def test_run_split_by_sentence_count_page_breaks() -> None: document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=50, split_overlap=0) text = ( @@ -301,35 +300,37 @@ def ignore_test_run_split_by_sentence_count_page_breaks() -> None: documents = document_splitter.run(documents=[Document(content=text)]) chunks_docs = documents["documents"] assert len(chunks_docs) == 5 - assert chunks_docs[0].content == "Sentence on page 1. Another on page 1." + + print("\n-----------") + for chunk in chunks_docs: + print(chunk.content) + print(chunk.meta) + print("\n-----------") + + assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f" assert chunks_docs[0].meta["page_number"] == 1 assert chunks_docs[0].meta["split_id"] == 0 assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) - assert chunks_docs[2].content == "\fSentence on page 2." - assert chunks_docs[2].meta["page_number"] == 2 - assert chunks_docs[2].meta["split_id"] == 2 - assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) - - assert chunks_docs[3].content == " Another on page 2." 
- assert chunks_docs[3].meta["page_number"] == 2 - assert chunks_docs[3].meta["split_id"] == 3 - assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) - - assert chunks_docs[4].content == "\fSentence on page 3." - assert chunks_docs[4].meta["page_number"] == 3 - assert chunks_docs[4].meta["split_id"] == 4 - assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) - - assert chunks_docs[5].content == " Another on page 3." - assert chunks_docs[5].meta["page_number"] == 3 - assert chunks_docs[5].meta["split_id"] == 5 - assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + assert chunks_docs[1].content == "Sentence on page 2. " + assert chunks_docs[1].meta["page_number"] == 2 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) - assert chunks_docs[6].content == "\f\f Sentence on page 5." - assert chunks_docs[6].meta["page_number"] == 5 - assert chunks_docs[6].meta["split_id"] == 6 - assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) + # assert chunks_docs[2].content == "\fSentence on page 3. Another on page 3.\f" + assert chunks_docs[2].meta["page_number"] == 3 + assert chunks_docs[2].meta["split_id"] == 3 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[4].content) + # + # assert chunks_docs[5].content == " Another on page 3." + # assert chunks_docs[5].meta["page_number"] == 3 + # assert chunks_docs[5].meta["split_id"] == 4 + # assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + # + # assert chunks_docs[6].content == "\f\f Sentence on page 5." + # assert chunks_docs[6].meta["page_number"] == 5 + # assert chunks_docs[6].meta["split_id"] == 5 + # assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): @@ -337,36 +338,28 @@ def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"]) text = "This is sentence one. This is sentence two. This is sentence three." - # ToDo: - # BUG: the sentence tokenizer is not working correctly, it eliminates the space between the sentences - doc = Document(content=text) doc_chunks = splitter.run([doc])["documents"] assert len(doc_chunks) == 3 - # for i, chunk in enumerate(doc_chunks): - # print(chunk.id) - # print(chunk.content) - # print(chunk.meta) - # print("\n---") - - # assert doc_chunks[0].content == "This is sentence one." - # assert doc_chunks[0].meta["split_id"] == 0 - # assert doc_chunks[0].meta["split_idx_start"] == 0 - # assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 5)}] - # - # assert doc_chunks[1].content == " one.This is sentence two." - # assert doc_chunks[1].meta["split_id"] == 1 - # assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) - # assert doc_chunks[1].meta["_split_overlap"] == [ - # {"doc_id": doc_chunks[0].id, "range": (15, 20)}, - # {"doc_id": doc_chunks[2].id, "range": (0, 5)} - # ] - # - # assert doc_chunks[2].content == " two.This is sentence three." 
- # assert doc_chunks[2].meta["split_id"] == 2 - # assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) - # assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (16, 21)}] + + assert doc_chunks[0].content == "This is sentence one. " + assert doc_chunks[0].meta["split_id"] == 0 + assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) + assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 5)}] + + assert doc_chunks[1].content == "one. This is sentence two. " + assert doc_chunks[1].meta["split_id"] == 1 + assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) + assert doc_chunks[1].meta["_split_overlap"] == [ + {"doc_id": doc_chunks[0].id, "range": (17, 22)}, + {"doc_id": doc_chunks[2].id, "range": (0, 5)}, + ] + + assert doc_chunks[2].content == "two. This is sentence three." + assert doc_chunks[2].meta["split_id"] == 2 + assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) + assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (22, 27)}] def test_run_split_document_with_overlap(): From 3f00b3b70f1ce0f9edd9d47352ca824c606c8719 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 15:20:01 +0100 Subject: [PATCH 42/82] adding tests for counting pages on different split approaches --- .../preprocessors/recursive_splitter.py | 18 ++-- .../preprocessors/test_recursive_splitter.py | 87 +++++++++++-------- 2 files changed, 62 insertions(+), 43 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index eea94dfd8f..dd075f309c 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -52,7 +52,11 @@ class RecursiveDocumentSplitter: """ # noqa: E501 def __init__( # pylint: disable=too-many-positional-arguments - self, split_length: int = 200, split_overlap: int = 0, separators: Optional[List[str]] = None + self, + split_length: int = 200, + split_overlap: int = 0, + separators: Optional[List[str]] = None, + sentence_splitter_params: Optional[Dict[str, str]] = None, ): """ Initializes a RecursiveDocumentSplitter. @@ -70,9 +74,11 @@ def __init__( # pylint: disable=too-many-positional-arguments self.split_length = split_length self.split_overlap = split_overlap self.separators = separators if separators else ["\n\n", "sentence", "\n", " "] # default separators + self.sentence_tokenizer_params = sentence_splitter_params self._check_params() if "sentence" in self.separators: - self.nltk_tokenizer = self._get_custom_sentence_tokenizer() + sentence_splitter_params = sentence_splitter_params or {"keep_white_spaces": True} + self.nltk_tokenizer = self._get_custom_sentence_tokenizer(sentence_splitter_params) def _check_params(self): if self.split_length < 1: @@ -85,12 +91,12 @@ def _check_params(self): raise ValueError("All separators must be strings.") @staticmethod - def _get_custom_sentence_tokenizer(): + def _get_custom_sentence_tokenizer(sentence_splitter_params: Optional[Dict[str, str]] = None): try: from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter except (LookupError, ModuleNotFoundError): raise Exception("You need to install NLTK to use this function. 
You can install it via `pip install nltk`") - return SentenceSplitter(keep_white_spaces=True) + return SentenceSplitter(**sentence_splitter_params) def _apply_overlap(self, chunks: List[str]) -> List[str]: """ @@ -215,7 +221,9 @@ def _run_one(self, doc: Document) -> List[Document]: # count page breaks in the chunk current_page += chunk.count("\f") - # count the number of consecutive page breaks at the end of the chunk + + # if there are consecutive page breaks at the end with no more text, adjust the page number + # e.g: "text\f\f\f" -> 3 page breaks, but current_page should be 1 consecutive_page_breaks = len(chunk) - len(chunk.rstrip("\f")) if consecutive_page_breaks > 0: diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 8632954b34..71e189e30e 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -7,7 +7,7 @@ def test_get_custom_sentence_tokenizer_success(): - tokenizer = RecursiveDocumentSplitter._get_custom_sentence_tokenizer() + tokenizer = RecursiveDocumentSplitter._get_custom_sentence_tokenizer({}) assert isinstance(tokenizer, SentenceSplitter) @@ -105,7 +105,12 @@ def test_recursive_splitter_using_custom_sentence_tokenizer(): This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and requires a more sophisticated sentence tokenizer like the one provided by NLTK. """ - splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "]) + splitter = RecursiveDocumentSplitter( + split_length=400, + split_overlap=0, + separators=["\n\n", "\n", "sentence", " "], + sentence_splitter_params={"language": "en", "use_split_rules": True, "keep_white_spaces": False}, + ) text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -113,6 +118,7 @@ def test_recursive_splitter_using_custom_sentence_tokenizer(): chunks = splitter.run([Document(content=text)]) chunks = chunks["documents"] + assert len(chunks) == 4 assert chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n" assert ( @@ -240,17 +246,18 @@ def test_run_split_by_page_break_count_page_breaks() -> None: def test_run_split_by_new_line_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=50, split_overlap=0) + document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=21, split_overlap=0) text = ( - "Sentence on page 1.\nAnother on page 1.\f" - "Sentence on page 2.\nAnother on page 2.\f" - "Sentence on page 3.\nAnother on page 3.\f\f" + "Sentence on page 1.\nAnother on page 1.\n\f" + "Sentence on page 2.\nAnother on page 2.\n\f" + "Sentence on page 3.\nAnother on page 3.\n\f\f" "Sentence on page 5." 
) documents = document_splitter.run(documents=[Document(content=text)]) chunks_docs = documents["documents"] + assert len(chunks_docs) == 7 assert chunks_docs[0].content == "Sentence on page 1.\n" @@ -258,79 +265,83 @@ def test_run_split_by_new_line_count_page_breaks() -> None: assert chunks_docs[0].meta["split_id"] == 0 assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) - assert chunks_docs[1].content == "Another on page 1.\f" + assert chunks_docs[1].content == "Another on page 1.\n" assert chunks_docs[1].meta["page_number"] == 1 assert chunks_docs[1].meta["split_id"] == 1 assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) - assert chunks_docs[2].content == "Sentence on page 2.\n" + assert chunks_docs[2].content == "\fSentence on page 2.\n" assert chunks_docs[2].meta["page_number"] == 2 assert chunks_docs[2].meta["split_id"] == 2 assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) - assert chunks_docs[3].content == "Another on page 2.\f" + assert chunks_docs[3].content == "Another on page 2.\n" assert chunks_docs[3].meta["page_number"] == 2 assert chunks_docs[3].meta["split_id"] == 3 assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) - assert chunks_docs[4].content == "Sentence on page 3.\n" + assert chunks_docs[4].content == "\fSentence on page 3.\n" assert chunks_docs[4].meta["page_number"] == 3 assert chunks_docs[4].meta["split_id"] == 4 assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) - assert chunks_docs[5].content == "Another on page 3.\f\f" + assert chunks_docs[5].content == "Another on page 3.\n" assert chunks_docs[5].meta["page_number"] == 3 assert chunks_docs[5].meta["split_id"] == 5 assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) - assert chunks_docs[6].content == "Sentence on page 5." + assert chunks_docs[6].content == "\f\fSentence on page 5." assert chunks_docs[6].meta["page_number"] == 5 assert chunks_docs[6].meta["split_id"] == 6 assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) def test_run_split_by_sentence_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=50, split_overlap=0) + document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=28, split_overlap=0) text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" - "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + "Sentence on page 3. Another on page 3.\f\fSentence on page 5." ) documents = document_splitter.run(documents=[Document(content=text)]) chunks_docs = documents["documents"] - assert len(chunks_docs) == 5 - - print("\n-----------") - for chunk in chunks_docs: - print(chunk.content) - print(chunk.meta) - print("\n-----------") + assert len(chunks_docs) == 7 - assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f" + assert chunks_docs[0].content == "Sentence on page 1. " assert chunks_docs[0].meta["page_number"] == 1 assert chunks_docs[0].meta["split_id"] == 0 assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) - assert chunks_docs[1].content == "Sentence on page 2. 
" - assert chunks_docs[1].meta["page_number"] == 2 + assert chunks_docs[1].content == "Another on page 1.\f" + assert chunks_docs[1].meta["page_number"] == 1 assert chunks_docs[1].meta["split_id"] == 1 assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) - # assert chunks_docs[2].content == "\fSentence on page 3. Another on page 3.\f" - assert chunks_docs[2].meta["page_number"] == 3 - assert chunks_docs[2].meta["split_id"] == 3 - assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[4].content) - # - # assert chunks_docs[5].content == " Another on page 3." - # assert chunks_docs[5].meta["page_number"] == 3 - # assert chunks_docs[5].meta["split_id"] == 4 - # assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) - # - # assert chunks_docs[6].content == "\f\f Sentence on page 5." - # assert chunks_docs[6].meta["page_number"] == 5 - # assert chunks_docs[6].meta["split_id"] == 5 - # assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) + assert chunks_docs[2].content == "Sentence on page 2. " + assert chunks_docs[2].meta["page_number"] == 2 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == "Another on page 2.\f" + assert chunks_docs[3].meta["page_number"] == 2 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + assert chunks_docs[4].content == "Sentence on page 3. " + assert chunks_docs[4].meta["page_number"] == 3 + assert chunks_docs[4].meta["split_id"] == 4 + assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) + + assert chunks_docs[5].content == "Another on page 3.\f\f" + assert chunks_docs[5].meta["page_number"] == 3 + assert chunks_docs[5].meta["split_id"] == 5 + assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + + assert chunks_docs[6].content == "Sentence on page 5." + assert chunks_docs[6].meta["page_number"] == 5 + assert chunks_docs[6].meta["split_id"] == 6 + assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): From d9addfaa71ce619325fea33a1d5676a76ed74d43 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 15:38:52 +0100 Subject: [PATCH 43/82] NLTK checks done on SentenceSplitter --- haystack/components/preprocessors/recursive_splitter.py | 6 ++---- haystack/components/preprocessors/sentence_tokenizer.py | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index dd075f309c..dbf2b5f02f 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -92,10 +92,8 @@ def _check_params(self): @staticmethod def _get_custom_sentence_tokenizer(sentence_splitter_params: Optional[Dict[str, str]] = None): - try: - from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter - except (LookupError, ModuleNotFoundError): - raise Exception("You need to install NLTK to use this function. 
You can install it via `pip install nltk`") + from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter + return SentenceSplitter(**sentence_splitter_params) def _apply_overlap(self, chunks: List[str]) -> List[str]: diff --git a/haystack/components/preprocessors/sentence_tokenizer.py b/haystack/components/preprocessors/sentence_tokenizer.py index 505126e901..23ada4770d 100644 --- a/haystack/components/preprocessors/sentence_tokenizer.py +++ b/haystack/components/preprocessors/sentence_tokenizer.py @@ -135,6 +135,7 @@ def __init__( Currently supported languages are: en, de. :param keep_white_spaces: If True, the tokenizer will keep white spaces between sentences. """ + nltk_imports.check() self.language = language self.sentence_tokenizer = load_sentence_tokenizer(language, keep_white_spaces=keep_white_spaces) self.use_split_rules = use_split_rules From c3f09d0359fa287420bc65a43772f8777e1801bd Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 15:57:24 +0100 Subject: [PATCH 44/82] fixing types --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index dbf2b5f02f..3cb31a6893 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -91,7 +91,7 @@ def _check_params(self): raise ValueError("All separators must be strings.") @staticmethod - def _get_custom_sentence_tokenizer(sentence_splitter_params: Optional[Dict[str, str]] = None): + def _get_custom_sentence_tokenizer(sentence_splitter_params: Dict[str, str]): from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter return SentenceSplitter(**sentence_splitter_params) From 2df40c338cd46146821af36437f5ffec67dec04c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 16:51:31 +0100 Subject: [PATCH 45/82] adding detecting for full overlap with previous chunks --- .../components/preprocessors/recursive_splitter.py | 10 +++++++++- .../preprocessors/test_recursive_splitter.py | 12 +++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 3cb31a6893..fc421d552f 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -105,13 +105,21 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: The list of chunks with overlap applied. """ overlapped_chunks = [] + for idx, chunk in enumerate(chunks): if idx == 0: overlapped_chunks.append(chunk) continue overlap_start = max(0, len(chunks[idx - 1]) - self.split_overlap) - current_chunk = chunks[idx - 1][overlap_start:] + chunk + overlap = chunks[idx - 1][overlap_start:] + if overlap == chunks[idx - 1]: + logger.warning( + "Overlap is the same as the previous chunk. " + "Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter." 
+ ) + current_chunk = overlap + chunk overlapped_chunks.append(current_chunk) + return overlapped_chunks def _chunk_text(self, text: str) -> List[str]: diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 71e189e30e..f7f0bf1467 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -39,7 +39,7 @@ def test_apply_overlap_no_overlap(): assert result == ["chunk1", "chunk2", "chunk3"] -def test_apply_overlap_with_overlap_case_1(): +def test_apply_overlap_with_overlap(): # Test the case where there is overlap between chunks splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."]) chunks = ["chunk1", "chunk2", "chunk3"] @@ -47,12 +47,14 @@ def test_apply_overlap_with_overlap_case_1(): assert result == ["chunk1", "unk1chunk2", "unk2chunk3"] -# ToDo: update this test, result above is not the expected one -def ignore_test_apply_overlap_with_overlap_case_2(): +def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."]) chunks = ["chunk1", "chunk2", "chunk3", "chunk4"] - result = splitter._apply_overlap(chunks) - assert result == ["chunk1", "chunk1chunk2", "chunk2chunk3", "chunk3chunk4"] + _ = splitter._apply_overlap(chunks) + assert ( + "Overlap is the same as the previous chunk. Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter." + in caplog.text + ) def test_apply_overlap_single_chunk(): From 0492025d264a7516dd3169ea1b819f095e10afc4 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 17:20:51 +0100 Subject: [PATCH 46/82] fixing types --- haystack/components/preprocessors/recursive_splitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index fc421d552f..265dacd3e5 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -4,7 +4,7 @@ import re from copy import deepcopy -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional from haystack import Document, component, logging @@ -56,7 +56,7 @@ def __init__( # pylint: disable=too-many-positional-arguments split_length: int = 200, split_overlap: int = 0, separators: Optional[List[str]] = None, - sentence_splitter_params: Optional[Dict[str, str]] = None, + sentence_splitter_params: Optional[Dict[str, Any]] = None, ): """ Initializes a RecursiveDocumentSplitter. @@ -91,7 +91,7 @@ def _check_params(self): raise ValueError("All separators must be strings.") @staticmethod - def _get_custom_sentence_tokenizer(sentence_splitter_params: Dict[str, str]): + def _get_custom_sentence_tokenizer(sentence_splitter_params: Dict[str, Any]): from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter return SentenceSplitter(**sentence_splitter_params) From 09362e47b6ce85f2683c3d247b1cc2e5ddaf86ba Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 16 Dec 2024 17:36:28 +0100 Subject: [PATCH 47/82] improving docstring --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 265dacd3e5..32ba6667d3 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -61,7 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments """ Initializes a RecursiveDocumentSplitter. - :param split_length: The maximum length of each chunk. + :param split_length: The maximum length of each chunk in characters. :param split_overlap: The number of characters to overlap between consecutive chunks. :param separators: An optional list of separator strings to use for splitting the text. The string separators will be treated as regular expressions un less if the separator is "sentence", in that case the From eb38a2b1d296e2b08b1f05077831f408b38352cc Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 16 Dec 2024 17:38:57 +0100 Subject: [PATCH 48/82] improving docstring --- .../preprocessors/recursive_splitter.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 32ba6667d3..d7242cc371 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -64,9 +64,10 @@ def __init__( # pylint: disable=too-many-positional-arguments :param split_length: The maximum length of each chunk in characters. :param split_overlap: The number of characters to overlap between consecutive chunks. :param separators: An optional list of separator strings to use for splitting the text. The string - separators will be treated as regular expressions un less if the separator is "sentence", in that case the + separators will be treated as regular expressions unless the separator is "sentence", in that case the text will be split into sentences using a custom sentence tokenizer based on NLTK. - If no separators are provided, the default separators ["\n\n", "\n", ".", " "] are used. + See: haystack.components.preprocessors.sentence_tokenizer.SentenceSplitter. + If no separators are provided, the default separators ["\n\n", "sentence", "\n", " "] are used. :raises ValueError: If the overlap is greater than or equal to the chunk size or if the overlap is negative, or if any separator is not a string. @@ -126,10 +127,10 @@ def _chunk_text(self, text: str) -> List[str]: """ Recursive chunking algorithm that divides text into smaller chunks based on a list of separator characters. - It starts with a list of separator characters (e.g., ["\n\n", "\n", " ", ""]) and attempts to divide the text - using the first separator. If the resulting chunks are still larger than the specified chunk size, it moves to - the next separator in the list. This process continues recursively, progressively applying each specific - separator until the chunks meet the desired size criteria. + It starts with a list of separator characters (e.g., ["\n\n", "sentence", "\n", " "]) and attempts to divide + the text using the first separator. If the resulting chunks are still larger than the specified chunk size, + it moves to the next separator in the list. 
This process continues recursively, progressively applying each + specific separator until the chunks meet the desired size criteria. :param text: The text to be split into chunks. :returns: @@ -246,7 +247,7 @@ def _run_one(self, doc: Document) -> List[Document]: @component.output_types(documents=List[Document]) def run(self, documents: List[Document]) -> Dict[str, List[Document]]: """ - Split documents into Documents with smaller chunks of text. + Split a list of documents into documents with smaller chunks of text. :param documents: List of Documents to split. :returns: From a418f7339bece1ea0183a586b547a20358b2c427 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 17 Dec 2024 15:50:15 +0100 Subject: [PATCH 49/82] adding custom lenght, 'character' use case --- .../preprocessors/recursive_splitter.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index d7242cc371..f15951066b 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -4,7 +4,7 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Literal, Optional from haystack import Document, component, logging @@ -55,6 +55,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self, split_length: int = 200, split_overlap: int = 0, + split_units: Literal["words", "char"] = "char", separators: Optional[List[str]] = None, sentence_splitter_params: Optional[Dict[str, Any]] = None, ): @@ -63,6 +64,7 @@ def __init__( # pylint: disable=too-many-positional-arguments :param split_length: The maximum length of each chunk in characters. :param split_overlap: The number of characters to overlap between consecutive chunks. + :param split_units: The unit of the split_length parameter. It can be either "words" or "char". :param separators: An optional list of separator strings to use for splitting the text. The string separators will be treated as regular expressions unless the separator is "sentence", in that case the text will be split into sentences using a custom sentence tokenizer based on NLTK. @@ -74,6 +76,7 @@ def __init__( # pylint: disable=too-many-positional-arguments """ self.split_length = split_length self.split_overlap = split_overlap + self.split_units = split_units self.separators = separators if separators else ["\n\n", "sentence", "\n", " "] # default separators self.sentence_tokenizer_params = sentence_splitter_params self._check_params() @@ -123,6 +126,19 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: return overlapped_chunks + def _chunk_length(self, text: str) -> int: + """ + Get the length of the chunk in words or characters. + + :param text: The text to be split into chunks. + :returns: + The length of the chunk in words or characters. + """ + if self.split_units == "words": + return len(text.split()) + else: + return len(text) + def _chunk_text(self, text: str) -> List[str]: """ Recursive chunking algorithm that divides text into smaller chunks based on a list of separator characters. @@ -136,7 +152,7 @@ def _chunk_text(self, text: str) -> List[str]: :returns: A list of text chunks. 
""" - if len(text) <= self.split_length: + if self._chunk_length(text) <= self.split_length: return [text] for curr_separator in self.separators: # type: ignore # the caller already checked that separators is not None @@ -170,14 +186,14 @@ def _chunk_text(self, text: str) -> List[str]: for split in splits: split_text = split # if adding this split exceeds chunk_size, process current_chunk - if current_length + len(split_text) > self.split_length: + if current_length + self._chunk_length(split_text) > self.split_length: if current_chunk: # keep the good splits chunks.append("".join(current_chunk)) current_chunk = [] current_length = 0 # recursively handle splits that are too large - if len(split_text) > self.split_length: + if self._chunk_length(split_text) > self.split_length: if curr_separator == self.separators[-1]: # tried the last separator, can't split further, break the loop and fall back to # character-level chunking @@ -187,7 +203,7 @@ def _chunk_text(self, text: str) -> List[str]: chunks.append(split_text) else: current_chunk.append(split_text) - current_length += len(split_text) + current_length += self._chunk_length(split_text) if current_chunk: chunks.append("".join(current_chunk)) @@ -199,7 +215,10 @@ def _chunk_text(self, text: str) -> List[str]: return chunks # if no separator worked, fall back to character-level chunking - return [text[i : i + self.split_length] for i in range(0, len(text), self.split_length - self.split_overlap)] + return [ + text[i : i + self.split_length] + for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) + ] def _run_one(self, doc: Document) -> List[Document]: new_docs: List[Document] = [] From 71ce15b45177adb43b8de7fa247fef15c6bffa56 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 17 Dec 2024 17:38:19 +0100 Subject: [PATCH 50/82] customising overlap function for word and adding a few tests --- .../preprocessors/recursive_splitter.py | 46 ++++++++++++------- .../preprocessors/test_recursive_splitter.py | 23 +++++++++- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index f15951066b..a40cd15eef 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -55,7 +55,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self, split_length: int = 200, split_overlap: int = 0, - split_units: Literal["words", "char"] = "char", + split_units: Literal["word", "char"] = "char", separators: Optional[List[str]] = None, sentence_splitter_params: Optional[Dict[str, Any]] = None, ): @@ -114,8 +114,12 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: if idx == 0: overlapped_chunks.append(chunk) continue - overlap_start = max(0, len(chunks[idx - 1]) - self.split_overlap) - overlap = chunks[idx - 1][overlap_start:] + overlap_start = max(0, self._chunk_length(chunks[idx - 1]) - self.split_overlap) + if self.split_units == "word": + word_chunks = chunks[idx - 1].split() + overlap = " ".join(word_chunks[overlap_start:]) + else: + overlap = chunks[idx - 1][overlap_start:] if overlap == chunks[idx - 1]: logger.warning( "Overlap is the same as the previous chunk. " @@ -134,7 +138,7 @@ def _chunk_length(self, text: str) -> int: :returns: The length of the chunk in words or characters. 
""" - if self.split_units == "words": + if self.split_units == "word": return len(text.split()) else: return len(text) @@ -214,36 +218,44 @@ def _chunk_text(self, text: str) -> List[str]: if chunks: return chunks - # if no separator worked, fall back to character-level chunking + # if no separator worked, fall back to character- or word-level chunking return [ text[i : i + self.split_length] for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) ] + def _add_overlap_info(self, curr_pos, new_doc, new_docs): + prev_doc = new_docs[-1] + overlap_length = self._chunk_length(prev_doc.content) - (curr_pos - prev_doc.meta["split_idx_start"]) # type: ignore + if overlap_length > 0: + prev_doc.meta["_split_overlap"].append({"doc_id": new_doc.id, "range": (0, overlap_length)}) + new_doc.meta["_split_overlap"].append( + { + "doc_id": prev_doc.id, + "range": ( + self._chunk_length(prev_doc.content) - overlap_length, + self._chunk_length(prev_doc.content), # type: ignore + ), + } + ) + def _run_one(self, doc: Document) -> List[Document]: - new_docs: List[Document] = [] chunks = self._chunk_text(doc.content) # type: ignore # the caller already check for a non-empty doc.content - chunks = chunks[:-1] if len(chunks[-1]) == 0 else chunks # remove last empty chunk + chunks = chunks[:-1] if len(chunks[-1]) == 0 else chunks # remove last empty chunk if it exists current_position = 0 current_page = 1 + new_docs: List[Document] = [] + for split_nr, chunk in enumerate(chunks): new_doc = Document(content=chunk, meta=deepcopy(doc.meta)) new_doc.meta["split_id"] = split_nr new_doc.meta["split_idx_start"] = current_position new_doc.meta["_split_overlap"] = [] if self.split_overlap > 0 else None + # add overlap information to the previous and current doc if split_nr > 0 and self.split_overlap > 0: - previous_doc = new_docs[-1] - overlap_length = len(previous_doc.content) - (current_position - previous_doc.meta["split_idx_start"]) # type: ignore - if overlap_length > 0: - previous_doc.meta["_split_overlap"].append({"doc_id": new_doc.id, "range": (0, overlap_length)}) - new_doc.meta["_split_overlap"].append( - { - "doc_id": previous_doc.id, - "range": (len(previous_doc.content) - overlap_length, len(previous_doc.content)), # type: ignore - } - ) + self._add_overlap_info(current_position, new_doc, new_docs) # count page breaks in the chunk current_page += chunk.count("\f") diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index f7f0bf1467..e819cb32cb 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -375,7 +375,28 @@ def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (22, 27)}] -def test_run_split_document_with_overlap(): +def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap_word_unit_no_overlap(): + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["."], split_units="word") + text = "This is sentence one. This is sentence two. This is sentence three." + chunks = splitter.run([Document(content=text)])["documents"] + assert len(chunks) == 3 + assert chunks[0].content == "This is sentence one." + assert chunks[1].content == " This is sentence two." + assert chunks[2].content == " This is sentence three." 
+ + +def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap_word_unit_overlap_2_words(): + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=2, separators=["."], split_units="word") + text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." + chunks = splitter.run([Document(content=text)])["documents"] + assert len(chunks) == 4 + assert chunks[0].content == "This is sentence one." + assert chunks[1].content == "sentence one. This is sentence two." + assert chunks[2].content == "sentence two. This is sentence three." + assert chunks[3].content == "sentence three. This is sentence four." + + +def test_run_split_document_with_overlap_character_unit(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "]) text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" From 3a9d290cfc3c1a04937b6726aeba09c1d2a8764d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 17 Dec 2024 17:49:31 +0100 Subject: [PATCH 51/82] updating docstring --- haystack/components/preprocessors/recursive_splitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index a40cd15eef..11754d5434 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -62,9 +62,10 @@ def __init__( # pylint: disable=too-many-positional-arguments """ Initializes a RecursiveDocumentSplitter. - :param split_length: The maximum length of each chunk in characters. + :param split_length: The maximum length of each chunk by default in characters, but can be in words. + See the `split_units` parameter. :param split_overlap: The number of characters to overlap between consecutive chunks. - :param split_units: The unit of the split_length parameter. It can be either "words" or "char". + :param split_units: The unit of the split_length parameter. It can be either "word" or "char". :param separators: An optional list of separator strings to use for splitting the text. The string separators will be treated as regular expressions unless the separator is "sentence", in that case the text will be split into sentences using a custom sentence tokenizer based on NLTK. From 938b610b78cedd10de95ac1287fa4fe30cdb65ee Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 19 Dec 2024 10:23:17 +0100 Subject: [PATCH 52/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 11754d5434..d61e0ee628 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -225,7 +225,7 @@ def _chunk_text(self, text: str) -> List[str]: for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) ] - def _add_overlap_info(self, curr_pos, new_doc, new_docs): + def _add_overlap_info(self, curr_pos: int, new_doc: Document, new_docs: List[Document]) -> None: prev_doc = new_docs[-1] overlap_length = self._chunk_length(prev_doc.content) - (curr_pos - prev_doc.meta["split_idx_start"]) # type: ignore if overlap_length > 0: From bc4dfbd86be5deae4c108b3ae710cb0e20b9cd67 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Dec 2024 10:23:28 +0100 Subject: [PATCH 53/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index d61e0ee628..022f4db028 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -201,7 +201,7 @@ def _chunk_text(self, text: str) -> List[str]: if self._chunk_length(split_text) > self.split_length: if curr_separator == self.separators[-1]: # tried the last separator, can't split further, break the loop and fall back to - # character-level chunking + # word- or character-level chunking break chunks.extend(self._chunk_text(split_text)) else: From 371028cfedf56914b7ebbcec32ad953084618949 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Dec 2024 10:33:11 +0100 Subject: [PATCH 54/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 022f4db028..5511ccd3b0 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -124,7 +124,7 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: if overlap == chunks[idx - 1]: logger.warning( "Overlap is the same as the previous chunk. " - "Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter." + "Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter." ) current_chunk = overlap + chunk overlapped_chunks.append(current_chunk) From 79cd8bd7f3a865a73079bea5390ba2e573707dad Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 17 Dec 2024 18:48:37 +0100 Subject: [PATCH 55/82] wip: adding more tests for word unit length --- .../preprocessors/recursive_splitter.py | 24 +- .../preprocessors/test_recursive_splitter.py | 323 +++++++++++++++--- 2 files changed, 287 insertions(+), 60 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 5511ccd3b0..6f81ad28bc 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -55,7 +55,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self, split_length: int = 200, split_overlap: int = 0, - split_units: Literal["word", "char"] = "char", + split_unit: Literal["word", "char"] = "char", separators: Optional[List[str]] = None, sentence_splitter_params: Optional[Dict[str, Any]] = None, ): @@ -65,7 +65,7 @@ def __init__( # pylint: disable=too-many-positional-arguments :param split_length: The maximum length of each chunk by default in characters, but can be in words. See the `split_units` parameter. :param split_overlap: The number of characters to overlap between consecutive chunks. - :param split_units: The unit of the split_length parameter. It can be either "word" or "char". + :param split_unit: The unit of the split_length parameter. It can be either "word" or "char". :param separators: An optional list of separator strings to use for splitting the text. The string separators will be treated as regular expressions unless the separator is "sentence", in that case the text will be split into sentences using a custom sentence tokenizer based on NLTK. @@ -77,7 +77,7 @@ def __init__( # pylint: disable=too-many-positional-arguments """ self.split_length = split_length self.split_overlap = split_overlap - self.split_units = split_units + self.split_units = split_unit self.separators = separators if separators else ["\n\n", "sentence", "\n", " "] # default separators self.sentence_tokenizer_params = sentence_splitter_params self._check_params() @@ -190,8 +190,10 @@ def _chunk_text(self, text: str) -> List[str]: # check splits, if any is too long, recursively chunk it, otherwise add to current chunk for split in splits: split_text = split + # if adding this split exceeds chunk_size, process current_chunk if current_length + self._chunk_length(split_text) > self.split_length: + # process current_chunk if current_chunk: # keep the good splits chunks.append("".join(current_chunk)) current_chunk = [] @@ -220,10 +222,18 @@ def _chunk_text(self, text: str) -> List[str]: return chunks # if no separator worked, fall back to character- or word-level chunking - return [ - text[i : i + self.split_length] - for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) - ] + # ToDo: refactor into functions that can be easily tested + if self.split_units == "word": + return [ + " ".join(text.split()[i : i + self.split_length]) + for i in range(0, len(text.split()), self.split_length - self.split_overlap) + ] + + if self.split_units == "char": + return [ + text[i : i + self.split_length] + for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) + ] def _add_overlap_info(self, curr_pos: int, new_doc: Document, new_docs: List[Document]) -> None: prev_doc = new_docs[-1] diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index e819cb32cb..d044d8eff8 100644 --- 
a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -83,7 +83,7 @@ def test_chunk_text_by_period(): assert chunks[2] == " And one more." -def test_recursive_splitter_multiple_new_lines(): +def test_run_multiple_new_lines(): splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"]) text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) @@ -93,7 +93,7 @@ def test_recursive_splitter_multiple_new_lines(): assert chunks[2].content == "\n\nFinal test." -def test_recursive_splitter_empty_documents(caplog: LogCaptureFixture): +def test_run_empty_documents(caplog: LogCaptureFixture): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) empty_doc = Document(content="") doc_chunks = splitter.run([empty_doc]) @@ -102,7 +102,7 @@ def test_recursive_splitter_empty_documents(caplog: LogCaptureFixture): assert "has an empty content. Skipping this document." in caplog.text -def test_recursive_splitter_using_custom_sentence_tokenizer(): +def test_run_using_custom_sentence_tokenizer(): """ This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and requires a more sophisticated sentence tokenizer like the one provided by NLTK. @@ -346,56 +346,6 @@ def test_run_split_by_sentence_count_page_breaks() -> None: assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) -def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap(): - """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap""" - splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"]) - text = "This is sentence one. This is sentence two. This is sentence three." - - doc = Document(content=text) - doc_chunks = splitter.run([doc])["documents"] - - assert len(doc_chunks) == 3 - - assert doc_chunks[0].content == "This is sentence one. " - assert doc_chunks[0].meta["split_id"] == 0 - assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) - assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 5)}] - - assert doc_chunks[1].content == "one. This is sentence two. " - assert doc_chunks[1].meta["split_id"] == 1 - assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) - assert doc_chunks[1].meta["_split_overlap"] == [ - {"doc_id": doc_chunks[0].id, "range": (17, 22)}, - {"doc_id": doc_chunks[2].id, "range": (0, 5)}, - ] - - assert doc_chunks[2].content == "two. This is sentence three." - assert doc_chunks[2].meta["split_id"] == 2 - assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) - assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (22, 27)}] - - -def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap_word_unit_no_overlap(): - splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["."], split_units="word") - text = "This is sentence one. This is sentence two. This is sentence three." - chunks = splitter.run([Document(content=text)])["documents"] - assert len(chunks) == 3 - assert chunks[0].content == "This is sentence one." - assert chunks[1].content == " This is sentence two." - assert chunks[2].content == " This is sentence three." 
- - -def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap_word_unit_overlap_2_words(): - splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=2, separators=["."], split_units="word") - text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." - chunks = splitter.run([Document(content=text)])["documents"] - assert len(chunks) == 4 - assert chunks[0].content == "This is sentence one." - assert chunks[1].content == "sentence one. This is sentence two." - assert chunks[2].content == "sentence two. This is sentence three." - assert chunks[3].content == "sentence three. This is sentence four." - - def test_run_split_document_with_overlap_character_unit(): splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "]) text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" @@ -452,6 +402,273 @@ def test_run_fallback_to_character_chunking(): assert len(chunk.content) <= 2 +def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit(): + """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap""" + splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"]) + text = "This is sentence one. This is sentence two. This is sentence three." + + doc = Document(content=text) + doc_chunks = splitter.run([doc])["documents"] + + assert len(doc_chunks) == 3 + + assert doc_chunks[0].content == "This is sentence one. " + assert doc_chunks[0].meta["split_id"] == 0 + assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) + assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 5)}] + + assert doc_chunks[1].content == "one. This is sentence two. " + assert doc_chunks[1].meta["split_id"] == 1 + assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) + assert doc_chunks[1].meta["_split_overlap"] == [ + {"doc_id": doc_chunks[0].id, "range": (17, 22)}, + {"doc_id": doc_chunks[2].id, "range": (0, 5)}, + ] + + assert doc_chunks[2].content == "two. This is sentence three." + assert doc_chunks[2].meta["split_id"] == 2 + assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) + assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (22, 27)}] + + +def test_run_split_by_dot_count_page_breaks_word_unit() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=4, split_overlap=0, split_unit="word") + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 7 + assert documents[0].content == "Sentence on page 1." + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + + assert documents[1].content == " Another on page 1." + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + + assert documents[2].content == "\fSentence on page 2." 
+ assert documents[2].meta["page_number"] == 2 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + + assert documents[3].content == " Another on page 2." + assert documents[3].meta["page_number"] == 2 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + + assert documents[4].content == "\fSentence on page 3." + assert documents[4].meta["page_number"] == 3 + assert documents[4].meta["split_id"] == 4 + assert documents[4].meta["split_idx_start"] == text.index(documents[4].content) + + assert documents[5].content == " Another on page 3." + assert documents[5].meta["page_number"] == 3 + assert documents[5].meta["split_id"] == 5 + assert documents[5].meta["split_idx_start"] == text.index(documents[5].content) + + assert documents[6].content == "\f\f Sentence on page 5." + assert documents[6].meta["page_number"] == 5 + assert documents[6].meta["split_id"] == 6 + assert documents[6].meta["split_idx_start"] == text.index(documents[6].content) + + +def test_run_split_by_word_count_page_breaks_word_unit(): + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["w"], split_unit="word") + text = "This is some text. \f This text is on another page. \f This is the last pag3." + doc = Document(content=text) + doc_chunks = splitter.run([doc]) + doc_chunks = doc_chunks["documents"] + + assert len(doc_chunks) == 4 + assert doc_chunks[0].content == "This is some text." + assert doc_chunks[0].meta["page_number"] == 1 + assert doc_chunks[0].meta["split_id"] == 0 + assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) + + assert doc_chunks[1].content == " \f This text is on" + assert doc_chunks[1].meta["page_number"] == 2 + assert doc_chunks[1].meta["split_id"] == 1 + assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) + + assert doc_chunks[2].content == " another page. \f T" + assert doc_chunks[2].meta["page_number"] == 3 + assert doc_chunks[2].meta["split_id"] == 2 + assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) + + assert doc_chunks[3].content == "his is the last pa" + assert doc_chunks[3].meta["page_number"] == 3 + assert doc_chunks[3].meta["split_id"] == 3 + assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) + + assert doc_chunks[4].content == "g3." + assert doc_chunks[4].meta["page_number"] == 3 + assert doc_chunks[4].meta["split_id"] == 4 + assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content) + + +def test_run_split_by_page_break_count_page_breaks_word_unit() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=8, split_overlap=0, split_unit="word") + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)]) + chunks_docs = documents["documents"] + + assert len(chunks_docs) == 4 + assert chunks_docs[0].content == "Sentence on page 1. Another on page 1.\f" + assert chunks_docs[0].meta["page_number"] == 1 + assert chunks_docs[0].meta["split_id"] == 0 + assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) + + assert chunks_docs[1].content == "Sentence on page 2. 
Another on page 2.\f" + assert chunks_docs[1].meta["page_number"] == 2 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) + + assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f\f" + assert chunks_docs[2].meta["page_number"] == 3 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == " Sentence on page 5." + assert chunks_docs[3].meta["page_number"] == 5 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + +def test_run_split_by_new_line_count_page_breaks_word_unit() -> None: + document_splitter = RecursiveDocumentSplitter( + separators=["\n"], split_length=21, split_overlap=0, split_unit="word" + ) + + text = ( + "Sentence on page 1.\nAnother on page 1.\n\f" + "Sentence on page 2.\nAnother on page 2.\n\f" + "Sentence on page 3.\nAnother on page 3.\n\f\f" + "Sentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)]) + chunks_docs = documents["documents"] + + assert len(chunks_docs) == 7 + + assert chunks_docs[0].content == "Sentence on page 1.\n" + assert chunks_docs[0].meta["page_number"] == 1 + assert chunks_docs[0].meta["split_id"] == 0 + assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) + + assert chunks_docs[1].content == "Another on page 1.\n" + assert chunks_docs[1].meta["page_number"] == 1 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) + + assert chunks_docs[2].content == "\fSentence on page 2.\n" + assert chunks_docs[2].meta["page_number"] == 2 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == "Another on page 2.\n" + assert chunks_docs[3].meta["page_number"] == 2 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + assert chunks_docs[4].content == "\fSentence on page 3.\n" + assert chunks_docs[4].meta["page_number"] == 3 + assert chunks_docs[4].meta["split_id"] == 4 + assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) + + assert chunks_docs[5].content == "Another on page 3.\n" + assert chunks_docs[5].meta["page_number"] == 3 + assert chunks_docs[5].meta["split_id"] == 5 + assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + + assert chunks_docs[6].content == "\f\fSentence on page 5." + assert chunks_docs[6].meta["page_number"] == 5 + assert chunks_docs[6].meta["split_id"] == 6 + assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) + + +def test_run_split_by_sentence_count_page_breaks_word_unit() -> None: + document_splitter = RecursiveDocumentSplitter( + separators=["sentence"], split_length=28, split_overlap=0, split_unit="word" + ) + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\fSentence on page 5." + ) + + documents = document_splitter.run(documents=[Document(content=text)]) + chunks_docs = documents["documents"] + assert len(chunks_docs) == 7 + + assert chunks_docs[0].content == "Sentence on page 1. 
" + assert chunks_docs[0].meta["page_number"] == 1 + assert chunks_docs[0].meta["split_id"] == 0 + assert chunks_docs[0].meta["split_idx_start"] == text.index(chunks_docs[0].content) + + assert chunks_docs[1].content == "Another on page 1.\f" + assert chunks_docs[1].meta["page_number"] == 1 + assert chunks_docs[1].meta["split_id"] == 1 + assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) + + assert chunks_docs[2].content == "Sentence on page 2. " + assert chunks_docs[2].meta["page_number"] == 2 + assert chunks_docs[2].meta["split_id"] == 2 + assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) + + assert chunks_docs[3].content == "Another on page 2.\f" + assert chunks_docs[3].meta["page_number"] == 2 + assert chunks_docs[3].meta["split_id"] == 3 + assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) + + assert chunks_docs[4].content == "Sentence on page 3. " + assert chunks_docs[4].meta["page_number"] == 3 + assert chunks_docs[4].meta["split_id"] == 4 + assert chunks_docs[4].meta["split_idx_start"] == text.index(chunks_docs[4].content) + + assert chunks_docs[5].content == "Another on page 3.\f\f" + assert chunks_docs[5].meta["page_number"] == 3 + assert chunks_docs[5].meta["split_id"] == 5 + assert chunks_docs[5].meta["split_idx_start"] == text.index(chunks_docs[5].content) + + assert chunks_docs[6].content == "Sentence on page 5." + assert chunks_docs[6].meta["page_number"] == 5 + assert chunks_docs[6].meta["split_id"] == 6 + assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) + + +def test_run_custom_sentence_tokenizer_document_and_overlap_word_unit_no_overlap(): + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["."], split_unit="word") + text = "This is sentence one. This is sentence two. This is sentence three." + chunks = splitter.run([Document(content=text)])["documents"] + assert len(chunks) == 3 + assert chunks[0].content == "This is sentence one." + assert chunks[1].content == " This is sentence two." + assert chunks[2].content == " This is sentence three." + + +def test_run_custom_sentence_tokenizer_document_and_overlap_word_unit_overlap_2_words(): + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=2, separators=["."], split_unit="word") + text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." + chunks = splitter.run([Document(content=text)])["documents"] + assert len(chunks) == 4 + assert chunks[0].content == "This is sentence one." + assert chunks[1].content == "sentence one. This is sentence two." + assert chunks[2].content == "sentence two. This is sentence three." + assert chunks[3].content == "sentence three. This is sentence four." + + def test_run_serialization_in_pipeline(): pipeline = Pipeline() pipeline.add_component("chunker", RecursiveDocumentSplitter(split_length=20, split_overlap=5, separators=["."])) From 31c8412050f8c4939103434ab7deefbee0859182 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 17 Dec 2024 18:54:10 +0100 Subject: [PATCH 56/82] fix --- haystack/components/preprocessors/recursive_splitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 6f81ad28bc..9838d3d48a 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -222,11 +222,11 @@ def _chunk_text(self, text: str) -> List[str]: return chunks # if no separator worked, fall back to character- or word-level chunking - # ToDo: refactor into functions that can be easily tested + # ToDo: refactor into a function making use of split_unit parameter that can be easily tested in isolation if self.split_units == "word": return [ " ".join(text.split()[i : i + self.split_length]) - for i in range(0, len(text.split()), self.split_length - self.split_overlap) + for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) ] if self.split_units == "char": From e1fed92b58cb99574005d9370b11f3c2ba4f9613 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 18 Dec 2024 12:36:44 +0100 Subject: [PATCH 57/82] feat: `Tool` dataclass - unified abstraction to represent tools (#8652) * draft * del HF token in tests * adaptations * progress * fix type * import sorting * more control on deserialization * release note * improvements * support name field * fix chatpromptbuilder test * port Tool from experimental * release note * docs upd * Update tool.py --------- Co-authored-by: Daria Fokina --- docs/pydoc/config/data_classess_api.yml | 2 +- haystack/dataclasses/__init__.py | 2 + haystack/dataclasses/tool.py | 243 ++++++++++++++ pyproject.toml | 3 +- .../tool-dataclass-12756077bbfea3a1.yaml | 8 + test/dataclasses/test_tool.py | 305 ++++++++++++++++++ 6 files changed, 561 insertions(+), 2 deletions(-) create mode 100644 haystack/dataclasses/tool.py create mode 100644 releasenotes/notes/tool-dataclass-12756077bbfea3a1.yaml create mode 100644 test/dataclasses/test_tool.py diff --git a/docs/pydoc/config/data_classess_api.yml b/docs/pydoc/config/data_classess_api.yml index a67f28db9d..71ea77513a 100644 --- a/docs/pydoc/config/data_classess_api.yml +++ b/docs/pydoc/config/data_classess_api.yml @@ -2,7 +2,7 @@ loaders: - type: haystack_pydoc_tools.loaders.CustomPythonLoader search_path: [../../../haystack/dataclasses] modules: - ["answer", "byte_stream", "chat_message", "document", "streaming_chunk", "sparse_embedding"] + ["answer", "byte_stream", "chat_message", "document", "streaming_chunk", "sparse_embedding", "tool"] ignore_when_discovered: ["__init__"] processors: - type: filter diff --git a/haystack/dataclasses/__init__.py b/haystack/dataclasses/__init__.py index 91e8f0408f..97f253e805 100644 --- a/haystack/dataclasses/__init__.py +++ b/haystack/dataclasses/__init__.py @@ -8,6 +8,7 @@ from haystack.dataclasses.document import Document from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.dataclasses.streaming_chunk import StreamingChunk +from haystack.dataclasses.tool import Tool __all__ = [ "Document", @@ -22,4 +23,5 @@ "TextContent", "StreamingChunk", "SparseEmbedding", + "Tool", ] diff --git a/haystack/dataclasses/tool.py b/haystack/dataclasses/tool.py new file mode 100644 index 0000000000..3df3fd18f2 --- /dev/null +++ b/haystack/dataclasses/tool.py @@ -0,0 +1,243 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: 
Apache-2.0 + +import inspect +from dataclasses import asdict, dataclass +from typing import Any, Callable, Dict, Optional + +from pydantic import create_model + +from haystack.lazy_imports import LazyImport +from haystack.utils import deserialize_callable, serialize_callable + +with LazyImport(message="Run 'pip install jsonschema'") as jsonschema_import: + from jsonschema import Draft202012Validator + from jsonschema.exceptions import SchemaError + + +class ToolInvocationError(Exception): + """ + Exception raised when a Tool invocation fails. + """ + + pass + + +class SchemaGenerationError(Exception): + """ + Exception raised when automatic schema generation fails. + """ + + pass + + +@dataclass +class Tool: + """ + Data class representing a Tool that Language Models can prepare a call for. + + Accurate definitions of the textual attributes such as `name` and `description` + are important for the Language Model to correctly prepare the call. + + :param name: + Name of the Tool. + :param description: + Description of the Tool. + :param parameters: + A JSON schema defining the parameters expected by the Tool. + :param function: + The function that will be invoked when the Tool is called. + """ + + name: str + description: str + parameters: Dict[str, Any] + function: Callable + + def __post_init__(self): + jsonschema_import.check() + # Check that the parameters define a valid JSON schema + try: + Draft202012Validator.check_schema(self.parameters) + except SchemaError as e: + raise ValueError("The provided parameters do not define a valid JSON schema") from e + + @property + def tool_spec(self) -> Dict[str, Any]: + """ + Return the Tool specification to be used by the Language Model. + """ + return {"name": self.name, "description": self.description, "parameters": self.parameters} + + def invoke(self, **kwargs) -> Any: + """ + Invoke the Tool with the provided keyword arguments. + """ + + try: + result = self.function(**kwargs) + except Exception as e: + raise ToolInvocationError(f"Failed to invoke Tool `{self.name}` with parameters {kwargs}") from e + return result + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the Tool to a dictionary. + + :returns: + Dictionary with serialized data. + """ + + serialized = asdict(self) + serialized["function"] = serialize_callable(self.function) + return serialized + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Tool": + """ + Deserializes the Tool from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized Tool. + """ + data["function"] = deserialize_callable(data["function"]) + return cls(**data) + + @classmethod + def from_function(cls, function: Callable, name: Optional[str] = None, description: Optional[str] = None) -> "Tool": + """ + Create a Tool instance from a function. 
+ + ### Usage example + + ```python + from typing import Annotated, Literal + from haystack.dataclasses import Tool + + def get_weather( + city: Annotated[str, "the city for which to get the weather"] = "Munich", + unit: Annotated[Literal["Celsius", "Fahrenheit"], "the unit for the temperature"] = "Celsius"): + '''A simple function to get the current weather for a location.''' + return f"Weather report for {city}: 20 {unit}, sunny" + + tool = Tool.from_function(get_weather) + + print(tool) + >>> Tool(name='get_weather', description='A simple function to get the current weather for a location.', + >>> parameters={ + >>> 'type': 'object', + >>> 'properties': { + >>> 'city': {'type': 'string', 'description': 'the city for which to get the weather', 'default': 'Munich'}, + >>> 'unit': { + >>> 'type': 'string', + >>> 'enum': ['Celsius', 'Fahrenheit'], + >>> 'description': 'the unit for the temperature', + >>> 'default': 'Celsius', + >>> }, + >>> } + >>> }, + >>> function=) + ``` + + :param function: + The function to be converted into a Tool. + The function must include type hints for all parameters. + If a parameter is annotated using `typing.Annotated`, its metadata will be used as parameter description. + :param name: + The name of the Tool. If not provided, the name of the function will be used. + :param description: + The description of the Tool. If not provided, the docstring of the function will be used. + To intentionally leave the description empty, pass an empty string. + + :returns: + The Tool created from the function. + + :raises ValueError: + If any parameter of the function lacks a type hint. + :raises SchemaGenerationError: + If there is an error generating the JSON schema for the Tool. + """ + + tool_description = description if description is not None else (function.__doc__ or "") + + signature = inspect.signature(function) + + # collect fields (types and defaults) and descriptions from function parameters + fields: Dict[str, Any] = {} + descriptions = {} + + for param_name, param in signature.parameters.items(): + if param.annotation is param.empty: + raise ValueError(f"Function '{function.__name__}': parameter '{param_name}' does not have a type hint.") + + # if the parameter has not a default value, Pydantic requires an Ellipsis (...) + # to explicitly indicate that the parameter is required + default = param.default if param.default is not param.empty else ... 
+ fields[param_name] = (param.annotation, default) + + if hasattr(param.annotation, "__metadata__"): + descriptions[param_name] = param.annotation.__metadata__[0] + + # create Pydantic model and generate JSON schema + try: + model = create_model(function.__name__, **fields) + schema = model.model_json_schema() + except Exception as e: + raise SchemaGenerationError(f"Failed to create JSON schema for function '{function.__name__}'") from e + + # we don't want to include title keywords in the schema, as they contain redundant information + # there is no programmatic way to prevent Pydantic from adding them, so we remove them later + # see https://github.com/pydantic/pydantic/discussions/8504 + _remove_title_from_schema(schema) + + # add parameters descriptions to the schema + for param_name, param_description in descriptions.items(): + if param_name in schema["properties"]: + schema["properties"][param_name]["description"] = param_description + + return Tool(name=name or function.__name__, description=tool_description, parameters=schema, function=function) + + +def _remove_title_from_schema(schema: Dict[str, Any]): + """ + Remove the 'title' keyword from JSON schema and contained property schemas. + + :param schema: + The JSON schema to remove the 'title' keyword from. + """ + schema.pop("title", None) + + for property_schema in schema["properties"].values(): + for key in list(property_schema.keys()): + if key == "title": + del property_schema[key] + + +def deserialize_tools_inplace(data: Dict[str, Any], key: str = "tools"): + """ + Deserialize Tools in a dictionary inplace. + + :param data: + The dictionary with the serialized data. + :param key: + The key in the dictionary where the Tools are stored. + """ + if key in data: + serialized_tools = data[key] + + if serialized_tools is None: + return + + if not isinstance(serialized_tools, list): + raise TypeError(f"The value of '{key}' is not a list") + + deserialized_tools = [] + for tool in serialized_tools: + if not isinstance(tool, dict): + raise TypeError(f"Serialized tool '{tool}' is not a dictionary") + deserialized_tools.append(Tool.from_dict(tool)) + + data[key] = deserialized_tools diff --git a/pyproject.toml b/pyproject.toml index c41c429ced..c1fddc8704 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ dependencies = [ "tenacity!=8.4.0", "lazy-imports", "openai>=1.56.1", + "pydantic", "Jinja2", "posthog", # telemetry "pyyaml", @@ -113,7 +114,7 @@ extra-dependencies = [ "jsonref", # OpenAPIServiceConnector, OpenAPIServiceToFunctions "openapi3", - # Validation + # JsonSchemaValidator, Tool "jsonschema", # Tracing diff --git a/releasenotes/notes/tool-dataclass-12756077bbfea3a1.yaml b/releasenotes/notes/tool-dataclass-12756077bbfea3a1.yaml new file mode 100644 index 0000000000..b6255ee1a9 --- /dev/null +++ b/releasenotes/notes/tool-dataclass-12756077bbfea3a1.yaml @@ -0,0 +1,8 @@ +--- +highlights: > + We are introducing the `Tool` dataclass: a simple and unified abstraction to represent tools throughout the framework. + By building on this abstraction, we will enable support for tools in Chat Generators, + providing a consistent experience across models. +features: + - | + Added a new `Tool` dataclass to represent a tool for which Language Models can prepare calls. 
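
For orientation while reading the tests that follow, here is a minimal usage sketch of the `Tool` dataclass added in this patch. It mirrors the example from the `Tool.from_function` docstring and assumes `pydantic` and `jsonschema` are installed (both required by schema generation and validation); it is a sketch, not an additional change to the patch.

```python
from typing import Annotated, Literal

from haystack.dataclasses import Tool


def get_weather(
    city: Annotated[str, "the city for which to get the weather"] = "Munich",
    unit: Annotated[Literal["Celsius", "Fahrenheit"], "the unit for the temperature"] = "Celsius",
) -> str:
    """A simple function to get the current weather for a location."""
    return f"Weather report for {city}: 20 {unit}, sunny"


# Build a Tool from the function: the name and description are taken from the
# function itself, and the JSON schema in `parameters` is generated via Pydantic.
tool = Tool.from_function(get_weather)

print(tool.tool_spec)              # {"name": ..., "description": ..., "parameters": {...}}
print(tool.invoke(city="Berlin"))  # Weather report for Berlin: 20 Celsius, sunny
```
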
diff --git a/test/dataclasses/test_tool.py b/test/dataclasses/test_tool.py new file mode 100644 index 0000000000..db9719a7f3 --- /dev/null +++ b/test/dataclasses/test_tool.py @@ -0,0 +1,305 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Literal, Optional + +import pytest + +from haystack.dataclasses.tool import ( + SchemaGenerationError, + Tool, + ToolInvocationError, + _remove_title_from_schema, + deserialize_tools_inplace, +) + +try: + from typing import Annotated +except ImportError: + from typing_extensions import Annotated + + +def get_weather_report(city: str) -> str: + return f"Weather report for {city}: 20°C, sunny" + + +parameters = {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]} + + +def function_with_docstring(city: str) -> str: + """Get weather report for a city.""" + return f"Weather report for {city}: 20°C, sunny" + + +class TestTool: + def test_init(self): + tool = Tool( + name="weather", description="Get weather report", parameters=parameters, function=get_weather_report + ) + + assert tool.name == "weather" + assert tool.description == "Get weather report" + assert tool.parameters == parameters + assert tool.function == get_weather_report + + def test_init_invalid_parameters(self): + parameters = {"type": "invalid", "properties": {"city": {"type": "string"}}} + + with pytest.raises(ValueError): + Tool(name="irrelevant", description="irrelevant", parameters=parameters, function=get_weather_report) + + def test_tool_spec(self): + tool = Tool( + name="weather", description="Get weather report", parameters=parameters, function=get_weather_report + ) + + assert tool.tool_spec == {"name": "weather", "description": "Get weather report", "parameters": parameters} + + def test_invoke(self): + tool = Tool( + name="weather", description="Get weather report", parameters=parameters, function=get_weather_report + ) + + assert tool.invoke(city="Berlin") == "Weather report for Berlin: 20°C, sunny" + + def test_invoke_fail(self): + tool = Tool( + name="weather", description="Get weather report", parameters=parameters, function=get_weather_report + ) + + with pytest.raises(ToolInvocationError): + tool.invoke() + + def test_to_dict(self): + tool = Tool( + name="weather", description="Get weather report", parameters=parameters, function=get_weather_report + ) + + assert tool.to_dict() == { + "name": "weather", + "description": "Get weather report", + "parameters": parameters, + "function": "test_tool.get_weather_report", + } + + def test_from_dict(self): + tool_dict = { + "name": "weather", + "description": "Get weather report", + "parameters": parameters, + "function": "test_tool.get_weather_report", + } + + tool = Tool.from_dict(tool_dict) + + assert tool.name == "weather" + assert tool.description == "Get weather report" + assert tool.parameters == parameters + assert tool.function == get_weather_report + + def test_from_function_description_from_docstring(self): + tool = Tool.from_function(function=function_with_docstring) + + assert tool.name == "function_with_docstring" + assert tool.description == "Get weather report for a city." 
+ assert tool.parameters == {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]} + assert tool.function == function_with_docstring + + def test_from_function_with_empty_description(self): + tool = Tool.from_function(function=function_with_docstring, description="") + + assert tool.name == "function_with_docstring" + assert tool.description == "" + assert tool.parameters == {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]} + assert tool.function == function_with_docstring + + def test_from_function_with_custom_description(self): + tool = Tool.from_function(function=function_with_docstring, description="custom description") + + assert tool.name == "function_with_docstring" + assert tool.description == "custom description" + assert tool.parameters == {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]} + assert tool.function == function_with_docstring + + def test_from_function_with_custom_name(self): + tool = Tool.from_function(function=function_with_docstring, name="custom_name") + + assert tool.name == "custom_name" + assert tool.description == "Get weather report for a city." + assert tool.parameters == {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]} + assert tool.function == function_with_docstring + + def test_from_function_missing_type_hint(self): + def function_missing_type_hint(city) -> str: + return f"Weather report for {city}: 20°C, sunny" + + with pytest.raises(ValueError): + Tool.from_function(function=function_missing_type_hint) + + def test_from_function_schema_generation_error(self): + def function_with_invalid_type_hint(city: "invalid") -> str: + return f"Weather report for {city}: 20°C, sunny" + + with pytest.raises(SchemaGenerationError): + Tool.from_function(function=function_with_invalid_type_hint) + + def test_from_function_annotated(self): + def function_with_annotations( + city: Annotated[str, "the city for which to get the weather"] = "Munich", + unit: Annotated[Literal["Celsius", "Fahrenheit"], "the unit for the temperature"] = "Celsius", + nullable_param: Annotated[Optional[str], "a nullable parameter"] = None, + ) -> str: + """A simple function to get the current weather for a location.""" + return f"Weather report for {city}: 20 {unit}, sunny" + + tool = Tool.from_function(function=function_with_annotations) + + assert tool.name == "function_with_annotations" + assert tool.description == "A simple function to get the current weather for a location." 
+ assert tool.parameters == { + "type": "object", + "properties": { + "city": {"type": "string", "description": "the city for which to get the weather", "default": "Munich"}, + "unit": { + "type": "string", + "enum": ["Celsius", "Fahrenheit"], + "description": "the unit for the temperature", + "default": "Celsius", + }, + "nullable_param": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "description": "a nullable parameter", + "default": None, + }, + }, + } + + +def test_deserialize_tools_inplace(): + tool = Tool(name="weather", description="Get weather report", parameters=parameters, function=get_weather_report) + serialized_tool = tool.to_dict() + print(serialized_tool) + + data = {"tools": [serialized_tool.copy()]} + deserialize_tools_inplace(data) + assert data["tools"] == [tool] + + data = {"mytools": [serialized_tool.copy()]} + deserialize_tools_inplace(data, key="mytools") + assert data["mytools"] == [tool] + + data = {"no_tools": 123} + deserialize_tools_inplace(data) + assert data == {"no_tools": 123} + + +def test_deserialize_tools_inplace_failures(): + data = {"key": "value"} + deserialize_tools_inplace(data) + assert data == {"key": "value"} + + data = {"tools": None} + deserialize_tools_inplace(data) + assert data == {"tools": None} + + data = {"tools": "not a list"} + with pytest.raises(TypeError): + deserialize_tools_inplace(data) + + data = {"tools": ["not a dictionary"]} + with pytest.raises(TypeError): + deserialize_tools_inplace(data) + + +def test_remove_title_from_schema(): + complex_schema = { + "properties": { + "parameter1": { + "anyOf": [{"type": "string"}, {"type": "integer"}], + "default": "default_value", + "title": "Parameter1", + }, + "parameter2": { + "default": [1, 2, 3], + "items": {"anyOf": [{"type": "string"}, {"type": "integer"}]}, + "title": "Parameter2", + "type": "array", + }, + "parameter3": { + "anyOf": [ + {"type": "string"}, + {"type": "integer"}, + {"items": {"anyOf": [{"type": "string"}, {"type": "integer"}]}, "type": "array"}, + ], + "default": 42, + "title": "Parameter3", + }, + "parameter4": { + "anyOf": [{"type": "string"}, {"items": {"type": "integer"}, "type": "array"}, {"type": "object"}], + "default": {"key": "value"}, + "title": "Parameter4", + }, + }, + "title": "complex_function", + "type": "object", + } + + _remove_title_from_schema(complex_schema) + + assert complex_schema == { + "properties": { + "parameter1": {"anyOf": [{"type": "string"}, {"type": "integer"}], "default": "default_value"}, + "parameter2": { + "default": [1, 2, 3], + "items": {"anyOf": [{"type": "string"}, {"type": "integer"}]}, + "type": "array", + }, + "parameter3": { + "anyOf": [ + {"type": "string"}, + {"type": "integer"}, + {"items": {"anyOf": [{"type": "string"}, {"type": "integer"}]}, "type": "array"}, + ], + "default": 42, + }, + "parameter4": { + "anyOf": [{"type": "string"}, {"items": {"type": "integer"}, "type": "array"}, {"type": "object"}], + "default": {"key": "value"}, + }, + }, + "type": "object", + } + + +def test_remove_title_from_schema_do_not_remove_title_property(): + """Test that the utility function only removes the 'title' keywords and not the 'title' property (if present).""" + schema = { + "properties": { + "parameter1": {"type": "string", "title": "Parameter1"}, + "title": {"type": "string", "title": "Title"}, + }, + "title": "complex_function", + "type": "object", + } + + _remove_title_from_schema(schema) + + assert schema == {"properties": {"parameter1": {"type": "string"}, "title": {"type": "string"}}, "type": "object"} + + +def 
test_remove_title_from_schema_handle_no_title_in_top_level(): + schema = { + "properties": { + "parameter1": {"type": "string", "title": "Parameter1"}, + "parameter2": {"type": "integer", "title": "Parameter2"}, + }, + "type": "object", + } + + _remove_title_from_schema(schema) + + assert schema == { + "properties": {"parameter1": {"type": "string"}, "parameter2": {"type": "integer"}}, + "type": "object", + } From f71a22bacd123adc68dc4b594c9d4d15c7a843ae Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Wed, 18 Dec 2024 21:34:57 +0100 Subject: [PATCH 58/82] fix: fix deserialization issues in multi-threading environments (#8651) --- haystack/core/pipeline/base.py | 5 ++--- haystack/utils/type_serialization.py | 20 ++++++++++++++++++- ...d-safe-module-import-ed04ad216820ab85.yaml | 4 ++++ 3 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/thread-safe-module-import-ed04ad216820ab85.yaml diff --git a/haystack/core/pipeline/base.py b/haystack/core/pipeline/base.py index 31ad2ad93c..d8f2a65932 100644 --- a/haystack/core/pipeline/base.py +++ b/haystack/core/pipeline/base.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import importlib import itertools from collections import defaultdict from copy import deepcopy @@ -26,7 +25,7 @@ from haystack.core.serialization import DeserializationCallbacks, component_from_dict, component_to_dict from haystack.core.type_utils import _type_name, _types_are_compatible from haystack.marshal import Marshaller, YamlMarshaller -from haystack.utils import is_in_jupyter +from haystack.utils import is_in_jupyter, type_serialization from .descriptions import find_pipeline_inputs, find_pipeline_outputs from .draw import _to_mermaid_image @@ -161,7 +160,7 @@ def from_dict( # Import the module first... module, _ = component_data["type"].rsplit(".", 1) logger.debug("Trying to import module {module_name}", module_name=module) - importlib.import_module(module) + type_serialization.thread_safe_import(module) # ...then try again if component_data["type"] not in component.registry: raise PipelineError( diff --git a/haystack/utils/type_serialization.py b/haystack/utils/type_serialization.py index b2dd319d52..5ffb505bb1 100644 --- a/haystack/utils/type_serialization.py +++ b/haystack/utils/type_serialization.py @@ -6,10 +6,14 @@ import inspect import sys import typing +from threading import Lock +from types import ModuleType from typing import Any, get_args, get_origin from haystack import DeserializationError +_import_lock = Lock() + def serialize_type(target: Any) -> str: """ @@ -132,7 +136,7 @@ def parse_generic_args(args_str): module = sys.modules.get(module_name) if not module: try: - module = importlib.import_module(module_name) + module = thread_safe_import(module_name) except ImportError as e: raise DeserializationError(f"Could not import the module: {module_name}") from e @@ -141,3 +145,17 @@ def parse_generic_args(args_str): raise DeserializationError(f"Could not locate the type: {type_name} in the module: {module_name}") return deserialized_type + + +def thread_safe_import(module_name: str) -> ModuleType: + """ + Import a module in a thread-safe manner. + + Importing modules in a multi-threaded environment can lead to race conditions. + This function ensures that the module is imported in a thread-safe manner without having impact + on the performance of the import for single-threaded environments. 
+ + :param module_name: the module to import + """ + with _import_lock: + return importlib.import_module(module_name) diff --git a/releasenotes/notes/thread-safe-module-import-ed04ad216820ab85.yaml b/releasenotes/notes/thread-safe-module-import-ed04ad216820ab85.yaml new file mode 100644 index 0000000000..3f1a0a2e78 --- /dev/null +++ b/releasenotes/notes/thread-safe-module-import-ed04ad216820ab85.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Fixes issues with deserialization of components in multi-threaded environments. From 211c4ed36e8b7a4f75ecf647a75afac32a7752e3 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Dec 2024 10:22:51 +0100 Subject: [PATCH 59/82] adding 'word' as default length --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 9838d3d48a..e9cd40f912 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -55,7 +55,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self, split_length: int = 200, split_overlap: int = 0, - split_unit: Literal["word", "char"] = "char", + split_unit: Literal["word", "char"] = "word", separators: Optional[List[str]] = None, sentence_splitter_params: Optional[Dict[str, Any]] = None, ): From 0807902df64fbf402fff0957d2f9edab637168f3 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Dec 2024 11:44:04 +0100 Subject: [PATCH 60/82] fixing types --- .../preprocessors/recursive_splitter.py | 65 ++++++++++++++----- .../preprocessors/test_recursive_splitter.py | 57 +++++++++------- 2 files changed, 82 insertions(+), 40 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index e9cd40f912..8223294f5f 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -51,8 +51,9 @@ class RecursiveDocumentSplitter: >] """ # noqa: E501 - def __init__( # pylint: disable=too-many-positional-arguments + def __init__( self, + *, split_length: int = 200, split_overlap: int = 0, split_unit: Literal["word", "char"] = "word", @@ -71,6 +72,8 @@ def __init__( # pylint: disable=too-many-positional-arguments text will be split into sentences using a custom sentence tokenizer based on NLTK. See: haystack.components.preprocessors.sentence_tokenizer.SentenceSplitter. If no separators are provided, the default separators ["\n\n", "sentence", "\n", " "] are used. + :param sentence_splitter_params: Optional parameters to pass to the sentence tokenizer. + See: haystack.components.preprocessors.sentence_tokenizer.SentenceSplitter for more information. :raises ValueError: If the overlap is greater than or equal to the chunk size or if the overlap is negative, or if any separator is not a string. 
@@ -81,9 +84,20 @@ def __init__( # pylint: disable=too-many-positional-arguments self.separators = separators if separators else ["\n\n", "sentence", "\n", " "] # default separators self.sentence_tokenizer_params = sentence_splitter_params self._check_params() + self.nltk_tokenizer = None if "sentence" in self.separators: - sentence_splitter_params = sentence_splitter_params or {"keep_white_spaces": True} - self.nltk_tokenizer = self._get_custom_sentence_tokenizer(sentence_splitter_params) + self.warm_up(sentence_splitter_params) + + def warm_up(self, sentence_splitter_params): + """ + Warm up the sentence tokenizer. + + :param sentence_splitter_params: Optional parameters to pass to the sentence tokenizer. + :returns: + An instance of the SentenceSplitter. + """ + sentence_splitter_params = sentence_splitter_params or {"keep_white_spaces": True} + self.nltk_tokenizer = self._get_custom_sentence_tokenizer(sentence_splitter_params) def _check_params(self): if self.split_length < 1: @@ -162,7 +176,8 @@ def _chunk_text(self, text: str) -> List[str]: for curr_separator in self.separators: # type: ignore # the caller already checked that separators is not None if curr_separator == "sentence": - sentence_with_spans = self.nltk_tokenizer.split_sentences(text) + # correct SentenceSplitter initialization is checked at the initialization of the component + sentence_with_spans = self.nltk_tokenizer.split_sentences(text) # type: ignore splits = [sentence["sentence"] for sentence in sentence_with_spans] else: escaped_separator = re.escape(curr_separator) @@ -221,19 +236,37 @@ def _chunk_text(self, text: str) -> List[str]: if chunks: return chunks - # if no separator worked, fall back to character- or word-level chunking - # ToDo: refactor into a function making use of split_unit parameter that can be easily tested in isolation + # if no separator worked, fall back to word- or character-level chunking if self.split_units == "word": - return [ - " ".join(text.split()[i : i + self.split_length]) - for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) - ] + return self.fall_back_to_word_level_chunking(text) - if self.split_units == "char": - return [ - text[i : i + self.split_length] - for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) - ] + return self.fall_back_to_char_level_chunking(text) + + def fall_back_to_word_level_chunking(self, text: str) -> List[str]: + """ + Fall back to word-level chunking if no separator works. + + :param text: The text to be split into chunks. + :returns: + A list of text chunks. + """ + return [ + " ".join(text.split()[i : i + self.split_length]) + for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) + ] + + def fall_back_to_char_level_chunking(self, text: str) -> List[str]: + """ + Fall back to character-level chunking if no separator works. + + :param text: The text to be split into chunks. + :returns: + A list of text chunks. 
+ """ + return [ + text[i : i + self.split_length] + for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) + ] def _add_overlap_info(self, curr_pos: int, new_doc: Document, new_docs: List[Document]) -> None: prev_doc = new_docs[-1] @@ -244,7 +277,7 @@ def _add_overlap_info(self, curr_pos: int, new_doc: Document, new_docs: List[Doc { "doc_id": prev_doc.id, "range": ( - self._chunk_length(prev_doc.content) - overlap_length, + self._chunk_length(prev_doc.content) - overlap_length, # type: ignore self._chunk_length(prev_doc.content), # type: ignore ), } diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index d044d8eff8..4ac5582386 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -33,7 +33,7 @@ def test_init_with_negative_split_length(): def test_apply_overlap_no_overlap(): # Test the case where there is no overlap between chunks - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char") chunks = ["chunk1", "chunk2", "chunk3"] result = splitter._apply_overlap(chunks) assert result == ["chunk1", "chunk2", "chunk3"] @@ -41,14 +41,14 @@ def test_apply_overlap_no_overlap(): def test_apply_overlap_with_overlap(): # Test the case where there is overlap between chunks - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."], split_unit="char") chunks = ["chunk1", "chunk2", "chunk3"] result = splitter._apply_overlap(chunks) assert result == ["chunk1", "unk1chunk2", "unk2chunk3"] def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."], split_unit="char") chunks = ["chunk1", "chunk2", "chunk3", "chunk4"] _ = splitter._apply_overlap(chunks) assert ( @@ -59,7 +59,7 @@ def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog): def test_apply_overlap_single_chunk(): # Test the case where there is only one chunk - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."], split_unit="char") chunks = ["chunk1"] result = splitter._apply_overlap(chunks) assert result == ["chunk1"] @@ -74,7 +74,7 @@ def test_chunk_text_smaller_than_chunk_size(): def test_chunk_text_by_period(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char") text = "This is a test. Another sentence. And one more." chunks = splitter._chunk_text(text) assert len(chunks) == 3 @@ -84,7 +84,7 @@ def test_chunk_text_by_period(): def test_run_multiple_new_lines(): - splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"]) + splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"], split_unit="char") text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." 
doc = Document(content=text) chunks = splitter.run([doc])["documents"] @@ -110,6 +110,7 @@ def test_run_using_custom_sentence_tokenizer(): splitter = RecursiveDocumentSplitter( split_length=400, split_overlap=0, + split_unit="char", separators=["\n\n", "\n", "sentence", " "], sentence_splitter_params={"language": "en", "use_split_rules": True, "keep_white_spaces": False}, ) @@ -134,8 +135,8 @@ def test_run_using_custom_sentence_tokenizer(): ) # noqa: E501 -def test_run_split_by_dot_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=30, split_overlap=0) +def test_run_split_by_dot_count_page_breaks_split_unit_char() -> None: + document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=30, split_overlap=0, split_unit="char") text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" @@ -181,8 +182,8 @@ def test_run_split_by_dot_count_page_breaks() -> None: assert documents[6].meta["split_idx_start"] == text.index(documents[6].content) -def test_run_split_by_word_count_page_breaks(): - splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["w"]) +def test_run_split_by_word_count_page_breaks_split_unit_char(): + splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["w"], split_unit="char") text = "This is some text. \f This text is on another page. \f This is the last pag3." doc = Document(content=text) doc_chunks = splitter.run([doc]) @@ -216,7 +217,9 @@ def test_run_split_by_word_count_page_breaks(): def test_run_split_by_page_break_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=50, split_overlap=0) + document_splitter = RecursiveDocumentSplitter( + separators=["\f"], split_length=50, split_overlap=0, split_unit="char" + ) text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" @@ -247,8 +250,10 @@ def test_run_split_by_page_break_count_page_breaks() -> None: assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) -def test_run_split_by_new_line_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=21, split_overlap=0) +def test_run_split_by_new_line_count_page_breaks_split_unit_char() -> None: + document_splitter = RecursiveDocumentSplitter( + separators=["\n"], split_length=21, split_overlap=0, split_unit="char" + ) text = ( "Sentence on page 1.\nAnother on page 1.\n\f" @@ -298,8 +303,10 @@ def test_run_split_by_new_line_count_page_breaks() -> None: assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) -def test_run_split_by_sentence_count_page_breaks() -> None: - document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=28, split_overlap=0) +def test_run_split_by_sentence_count_page_breaks_split_unit_char() -> None: + document_splitter = RecursiveDocumentSplitter( + separators=["sentence"], split_length=28, split_overlap=0, split_unit="char" + ) text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. 
Another on page 2.\f" @@ -347,7 +354,7 @@ def test_run_split_by_sentence_count_page_breaks() -> None: def test_run_split_document_with_overlap_character_unit(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "]) + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "], split_unit="char") text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" doc = Document(content=text) @@ -384,7 +391,7 @@ def test_run_split_document_with_overlap_character_unit(): def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_chunking(): - splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2) + splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2, split_unit="char") doc = Document(content="This is some text. This is some more text.") result = splitter.run(documents=[doc]) assert len(result["documents"]) == 21 @@ -392,10 +399,10 @@ def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_ assert len(doc.content) == 2 -def test_run_fallback_to_character_chunking(): +def test_run_fallback_to_character_chunking_by_default_length_too_short(): text = "abczdefzghizjkl" separators = ["\n\n", "\n", "z"] - splitter = RecursiveDocumentSplitter(split_length=2, separators=separators) + splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, split_unit="char") doc = Document(content=text) chunks = splitter.run([doc])["documents"] for chunk in chunks: @@ -404,7 +411,7 @@ def test_run_fallback_to_character_chunking(): def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit(): """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap""" - splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"]) + splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"], split_unit="char") text = "This is sentence one. This is sentence two. This is sentence three." doc = Document(content=text) @@ -485,6 +492,10 @@ def test_run_split_by_word_count_page_breaks_word_unit(): doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] + for doc in doc_chunks: + print(doc.content) + print(doc.meta) + assert len(doc_chunks) == 4 assert doc_chunks[0].content == "This is some text." assert doc_chunks[0].meta["page_number"] == 1 @@ -546,9 +557,7 @@ def test_run_split_by_page_break_count_page_breaks_word_unit() -> None: def test_run_split_by_new_line_count_page_breaks_word_unit() -> None: - document_splitter = RecursiveDocumentSplitter( - separators=["\n"], split_length=21, split_overlap=0, split_unit="word" - ) + document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=4, split_overlap=0, split_unit="word") text = ( "Sentence on page 1.\nAnother on page 1.\n\f" @@ -600,7 +609,7 @@ def test_run_split_by_new_line_count_page_breaks_word_unit() -> None: def test_run_split_by_sentence_count_page_breaks_word_unit() -> None: document_splitter = RecursiveDocumentSplitter( - separators=["sentence"], split_length=28, split_overlap=0, split_unit="word" + separators=["sentence"], split_length=7, split_overlap=0, split_unit="word" ) text = ( From 460cc7d1fae0648bb21357f45c0030ea90ab7650 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 19 Dec 2024 12:33:53 +0100 Subject: [PATCH 61/82] handing both default strategies --- .../preprocessors/recursive_splitter.py | 35 ++++++++----------- .../preprocessors/test_recursive_splitter.py | 10 ++++++ 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 8223294f5f..35e87ebdb6 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -237,36 +237,29 @@ def _chunk_text(self, text: str) -> List[str]: return chunks # if no separator worked, fall back to word- or character-level chunking - if self.split_units == "word": - return self.fall_back_to_word_level_chunking(text) - - return self.fall_back_to_char_level_chunking(text) + return self.fall_back_to_fixed_chunking(text, self.split_units) - def fall_back_to_word_level_chunking(self, text: str) -> List[str]: + def fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", "char"]) -> List[str]: """ - Fall back to word-level chunking if no separator works. + Fall back to a fixed chunking approach if no separator works for the text. :param text: The text to be split into chunks. + :param split_units: The unit of the split_length parameter. It can be either "word" or "char". :returns: A list of text chunks. """ - return [ - " ".join(text.split()[i : i + self.split_length]) - for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) - ] + chunks = [] + step = self.split_length - self.split_overlap - def fall_back_to_char_level_chunking(self, text: str) -> List[str]: - """ - Fall back to character-level chunking if no separator works. + if split_units == "word": + words = text.split() + for i in range(0, self._chunk_length(text), step): + chunks.append(" ".join(words[i : i + self.split_length])) + else: + for i in range(0, self._chunk_length(text), step): + chunks.append(text[i : i + self.split_length]) - :param text: The text to be split into chunks. - :returns: - A list of text chunks. - """ - return [ - text[i : i + self.split_length] - for i in range(0, self._chunk_length(text), self.split_length - self.split_overlap) - ] + return chunks def _add_overlap_info(self, curr_pos: int, new_doc: Document, new_docs: List[Document]) -> None: prev_doc = new_docs[-1] diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 4ac5582386..d73da66a9e 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -409,6 +409,16 @@ def test_run_fallback_to_character_chunking_by_default_length_too_short(): assert len(chunk.content) <= 2 +def test_run_fallback_to_word_chunking_by_default_length_too_short(): + text = "This is some text. This is some more text, and even more text." 
+ separators = ["\n\n", "\n", "."] + splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, split_unit="word") + doc = Document(content=text) + chunks = splitter.run([doc])["documents"] + for chunk in chunks: + assert splitter._chunk_length(chunk.content) <= 2 + + def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit(): """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap""" splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"], split_unit="char") From 7901af50da6daf77597e3b91d6c9d5ffa4406b71 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Dec 2024 16:52:24 +0100 Subject: [PATCH 62/82] wip --- .../components/preprocessors/recursive_splitter.py | 4 ++++ .../preprocessors/test_recursive_splitter.py | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 35e87ebdb6..efd9d92cf1 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -154,6 +154,10 @@ def _chunk_length(self, text: str) -> int: The length of the chunk in words or characters. """ if self.split_units == "word": + print(text) + print(text.split()) + print(len(text.split())) + print("-----------------") return len(text.split()) else: return len(text) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index d73da66a9e..69c1b014b5 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -52,7 +52,7 @@ def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog): chunks = ["chunk1", "chunk2", "chunk3", "chunk4"] _ = splitter._apply_overlap(chunks) assert ( - "Overlap is the same as the previous chunk. Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter." + "Overlap is the same as the previous chunk. Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter." in caplog.text ) @@ -183,7 +183,7 @@ def test_run_split_by_dot_count_page_breaks_split_unit_char() -> None: def test_run_split_by_word_count_page_breaks_split_unit_char(): - splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["w"], split_unit="char") + splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=[" "], split_unit="char") text = "This is some text. \f This text is on another page. \f This is the last pag3." doc = Document(content=text) doc_chunks = splitter.run([doc]) @@ -496,15 +496,20 @@ def test_run_split_by_dot_count_page_breaks_word_unit() -> None: def test_run_split_by_word_count_page_breaks_word_unit(): - splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["w"], split_unit="word") + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=[" "], split_unit="word") text = "This is some text. \f This text is on another page. \f This is the last pag3." 
doc = Document(content=text) doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] + print("\n\n") + print("-------------") for doc in doc_chunks: print(doc.content) print(doc.meta) + print("-------------") + + exit(-1) assert len(doc_chunks) == 4 assert doc_chunks[0].content == "This is some text." From 8a09157d6fafd79455eb4893caa91232cf90271c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Dec 2024 18:46:25 +0100 Subject: [PATCH 63/82] \f was not being counted properly --- .../preprocessors/recursive_splitter.py | 13 +++++---- .../preprocessors/test_recursive_splitter.py | 27 +++++++------------ 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index efd9d92cf1..6e8c3f0ea9 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -154,10 +154,11 @@ def _chunk_length(self, text: str) -> int: The length of the chunk in words or characters. """ if self.split_units == "word": - print(text) - print(text.split()) - print(len(text.split())) - print("-----------------") + # page breaks are counted as a single word or page breaks followed by only whitespace 1 or multiple times + # regex that matches a page break followed by only whitespace 1 or multiple times + if re.match(r"\f\s*", text): + return 1 + return len(text.split()) else: return len(text) @@ -226,7 +227,9 @@ def _chunk_text(self, text: str) -> List[str]: break chunks.extend(self._chunk_text(split_text)) else: - chunks.append(split_text) + # chunks.append(split_text) + current_chunk.append(split_text) + current_length += self._chunk_length(split_text) else: current_chunk.append(split_text) current_length += self._chunk_length(split_text) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 69c1b014b5..addfb8bb77 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -83,7 +83,7 @@ def test_chunk_text_by_period(): assert chunks[2] == " And one more." -def test_run_multiple_new_lines(): +def test_run_multiple_new_lines_unit_char(): splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"], split_unit="char") text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) @@ -502,37 +502,28 @@ def test_run_split_by_word_count_page_breaks_word_unit(): doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - print("\n\n") - print("-------------") - for doc in doc_chunks: - print(doc.content) - print(doc.meta) - print("-------------") - - exit(-1) - - assert len(doc_chunks) == 4 - assert doc_chunks[0].content == "This is some text." + assert len(doc_chunks) == 5 + assert doc_chunks[0].content == "This is some text. " assert doc_chunks[0].meta["page_number"] == 1 assert doc_chunks[0].meta["split_id"] == 0 assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) - assert doc_chunks[1].content == " \f This text is on" + assert doc_chunks[1].content == "\f This text is " assert doc_chunks[1].meta["page_number"] == 2 assert doc_chunks[1].meta["split_id"] == 1 assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) - assert doc_chunks[2].content == " another page. \f T" + assert doc_chunks[2].content == "on another page. 
\f " assert doc_chunks[2].meta["page_number"] == 3 assert doc_chunks[2].meta["split_id"] == 2 assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) - assert doc_chunks[3].content == "his is the last pa" + assert doc_chunks[3].content == "This is the last " assert doc_chunks[3].meta["page_number"] == 3 assert doc_chunks[3].meta["split_id"] == 3 assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) - assert doc_chunks[4].content == "g3." + assert doc_chunks[4].content == "pag3." assert doc_chunks[4].meta["page_number"] == 3 assert doc_chunks[4].meta["split_id"] == 4 assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content) @@ -560,12 +551,12 @@ def test_run_split_by_page_break_count_page_breaks_word_unit() -> None: assert chunks_docs[1].meta["split_id"] == 1 assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content) - assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f\f" + assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f" assert chunks_docs[2].meta["page_number"] == 3 assert chunks_docs[2].meta["split_id"] == 2 assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content) - assert chunks_docs[3].content == " Sentence on page 5." + assert chunks_docs[3].content == "\f Sentence on page 5." assert chunks_docs[3].meta["page_number"] == 5 assert chunks_docs[3].meta["split_id"] == 3 assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content) From 3ad73a537d28715929afeed8eda1fadc01d8e6fd Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 20 Dec 2024 12:39:50 +0100 Subject: [PATCH 64/82] updating tests --- .../preprocessors/recursive_splitter.py | 1 - .../preprocessors/test_recursive_splitter.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 6e8c3f0ea9..a1fdd991a0 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -227,7 +227,6 @@ def _chunk_text(self, text: str) -> List[str]: break chunks.extend(self._chunk_text(split_text)) else: - # chunks.append(split_text) current_chunk.append(split_text) current_length += self._chunk_length(split_text) else: diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index addfb8bb77..bc0049b6a5 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -84,13 +84,13 @@ def test_chunk_text_by_period(): def test_run_multiple_new_lines_unit_char(): - splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"], split_unit="char") + splitter = RecursiveDocumentSplitter(split_length=18, separators=["\n\n", "\n"], split_unit="char") text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test." doc = Document(content=text) chunks = splitter.run([doc])["documents"] assert chunks[0].content == "This is a test.\n\n" - assert chunks[1].content == "\nAnother test.\n\n" - assert chunks[2].content == "\n\nFinal test." + assert chunks[1].content == "\nAnother test.\n\n\n\n" + assert chunks[2].content == "Final test." 
def test_run_empty_documents(caplog: LogCaptureFixture): @@ -183,34 +183,34 @@ def test_run_split_by_dot_count_page_breaks_split_unit_char() -> None: def test_run_split_by_word_count_page_breaks_split_unit_char(): - splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=[" "], split_unit="char") + splitter = RecursiveDocumentSplitter(split_length=19, split_overlap=0, separators=[" "], split_unit="char") text = "This is some text. \f This text is on another page. \f This is the last pag3." doc = Document(content=text) doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] assert len(doc_chunks) == 5 - assert doc_chunks[0].content == "This is some text." + assert doc_chunks[0].content == "This is some text. " assert doc_chunks[0].meta["page_number"] == 1 assert doc_chunks[0].meta["split_id"] == 0 assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) - assert doc_chunks[1].content == " \f This text is on" + assert doc_chunks[1].content == "\f This text is on " assert doc_chunks[1].meta["page_number"] == 2 assert doc_chunks[1].meta["split_id"] == 1 assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) - assert doc_chunks[2].content == " another page. \f T" + assert doc_chunks[2].content == "another page. \f " assert doc_chunks[2].meta["page_number"] == 3 assert doc_chunks[2].meta["split_id"] == 2 assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) - assert doc_chunks[3].content == "his is the last pa" + assert doc_chunks[3].content == "This is the last " assert doc_chunks[3].meta["page_number"] == 3 assert doc_chunks[3].meta["split_id"] == 3 assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) - assert doc_chunks[4].content == "g3." + assert doc_chunks[4].content == "pag3." assert doc_chunks[4].meta["page_number"] == 3 assert doc_chunks[4].meta["split_id"] == 4 assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content) From b09154ec8fcea9cd668ad89ca0ed217308b3e49b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 20 Dec 2024 19:41:26 +0100 Subject: [PATCH 65/82] fixing the overlap bug --- .../preprocessors/recursive_splitter.py | 93 ++++++++++++++++--- .../preprocessors/test_recursive_splitter.py | 75 +++++++++------ 2 files changed, 127 insertions(+), 41 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index a1fdd991a0..0caf63ab29 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -118,33 +118,100 @@ def _get_custom_sentence_tokenizer(sentence_splitter_params: Dict[str, Any]): def _apply_overlap(self, chunks: List[str]) -> List[str]: """ Applies an overlap between consecutive chunks if the chunk_overlap attribute is greater than zero. - - :param chunks: List of text chunks. - :returns: - The list of chunks with overlap applied. 
""" - overlapped_chunks = [] + overlapped_chunks: List[str] = [] + remaining_words: List[str] = [] + remaining_chars: str = "" for idx, chunk in enumerate(chunks): if idx == 0: overlapped_chunks.append(chunk) continue - overlap_start = max(0, self._chunk_length(chunks[idx - 1]) - self.split_overlap) - if self.split_units == "word": - word_chunks = chunks[idx - 1].split() - overlap = " ".join(word_chunks[overlap_start:]) - else: - overlap = chunks[idx - 1][overlap_start:] - if overlap == chunks[idx - 1]: + + overlap, prev_chunk = self._get_overlap(overlapped_chunks) + + if overlap == prev_chunk: logger.warning( "Overlap is the same as the previous chunk. " "Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter." ) - current_chunk = overlap + chunk + + # create new chunk starting with the overlap + if self.split_units == "word": + current_chunk = overlap + " " + chunk + else: + current_chunk = overlap + chunk + + # if the new chunk exceeds split_length, trim it and add the trimmed content to the next chunk + if self._chunk_length(current_chunk) > self.split_length: + if self.split_units == "word": + words = current_chunk.split() + current_chunk = " ".join(words[: self.split_length]) + remaining_words = words[self.split_length :] + if idx < len(chunks) - 1: + # add remaining words to the beginning of the next chunk + chunks[idx + 1] = " ".join(remaining_words) + " " + chunks[idx + 1] + elif remaining_words: + # if this is the last chunk, and we have remaining words + overlapped_chunks.append(current_chunk) + current_chunk = " ".join(remaining_words) + + else: # char-level splitting + text = current_chunk + current_chunk = text[: self.split_length] + remaining_chars = text[self.split_length :] + if idx < len(chunks) - 1: + # add remaining chars to the beginning of the next chunk + chunks[idx + 1] = remaining_chars + chunks[idx + 1] + elif remaining_chars: # if this is the last chunk and we have remaining chars + overlapped_chunks.append(current_chunk) + current_chunk = remaining_chars + + # if this is the last chunk, and we have remaining words or characters, add them to the current chunk + if idx == len(chunks) - 1 and (remaining_words or remaining_chars): + overlap, prev_chunk = self._get_overlap(overlapped_chunks) + if remaining_words: + current_chunk = overlap + " " + current_chunk + if remaining_chars: + current_chunk = overlap + current_chunk + overlapped_chunks.append(current_chunk) + # check if the last chunk exceeds split_length and split it + if idx == len(chunks) - 1 and self._chunk_length(current_chunk) > self.split_length: + # split the last chunk and add the first chunk to the list + last_chunk = overlapped_chunks.pop() + if self.split_units == "word": + words = last_chunk.split() + first_chunk = " ".join(words[: self.split_length]) + remaining_chunk = " ".join(words[self.split_length :]) + else: + first_chunk = last_chunk[: self.split_length] + remaining_chunk = last_chunk[self.split_length :] + overlapped_chunks.append(first_chunk) + + # add the remaining chunk with overlap from the previous chunk + if remaining_chunk: + overlap, prev_chunk = self._get_overlap(overlapped_chunks) + if self.split_units == "word": + remaining_chunk = overlap + " " + remaining_chunk + else: + remaining_chunk = overlap + remaining_chunk + overlapped_chunks.append(remaining_chunk) + return overlapped_chunks + def _get_overlap(self, overlapped_chunks): + """Get the previous overlapped chunk instead of the original chunk.""" + prev_chunk = overlapped_chunks[-1] + 
overlap_start = max(0, self._chunk_length(prev_chunk) - self.split_overlap) + if self.split_units == "word": + word_chunks = prev_chunk.split() + overlap = " ".join(word_chunks[overlap_start:]) + else: + overlap = prev_chunk[overlap_start:] + return overlap, prev_chunk + def _chunk_length(self, text: str) -> int: """ Get the length of the chunk in words or characters. diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index bc0049b6a5..0413d5538f 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -354,40 +354,47 @@ def test_run_split_by_sentence_count_page_breaks_split_unit_char() -> None: def test_run_split_document_with_overlap_character_unit(): - splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "], split_unit="char") - text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4""" + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=10, separators=["."], split_unit="char") + text = """A simple sentence1. A bright sentence2. A clever sentence3""" doc = Document(content=text) doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - assert len(doc_chunks) == 4 - + assert len(doc_chunks) == 5 assert doc_chunks[0].content == "A simple sentence1." assert doc_chunks[0].meta["split_id"] == 0 assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) - assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 11)}] + assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 10)}] - assert doc_chunks[1].content == " sentence1. A bright sentence2." + assert doc_chunks[1].content == "sentence1. A bright " assert doc_chunks[1].meta["split_id"] == 1 assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) assert doc_chunks[1].meta["_split_overlap"] == [ - {"doc_id": doc_chunks[0].id, "range": (8, 19)}, - {"doc_id": doc_chunks[2].id, "range": (0, 11)}, + {"doc_id": doc_chunks[0].id, "range": (9, 19)}, + {"doc_id": doc_chunks[2].id, "range": (0, 10)}, ] - assert doc_chunks[2].content == " sentence2. A clever sentence3." + assert doc_chunks[2].content == " A bright sentence2." assert doc_chunks[2].meta["split_id"] == 2 assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) assert doc_chunks[2].meta["_split_overlap"] == [ - {"doc_id": doc_chunks[1].id, "range": (20, 31)}, - {"doc_id": doc_chunks[3].id, "range": (0, 11)}, + {"doc_id": doc_chunks[1].id, "range": (10, 20)}, + {"doc_id": doc_chunks[3].id, "range": (0, 10)}, ] - assert doc_chunks[3].content == " sentence3. A joyful sentence4" + assert doc_chunks[3].content == "sentence2. 
A clever " assert doc_chunks[3].meta["split_id"] == 3 assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) - assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (20, 31)}] + assert doc_chunks[3].meta["_split_overlap"] == [ + {"doc_id": doc_chunks[2].id, "range": (10, 20)}, + {"doc_id": doc_chunks[4].id, "range": (0, 10)}, + ] + + assert doc_chunks[4].content == " A clever sentence3" + assert doc_chunks[4].meta["split_id"] == 4 + assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content) + assert doc_chunks[4].meta["_split_overlap"] == [{"doc_id": doc_chunks[3].id, "range": (10, 20)}] def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_chunking(): @@ -421,31 +428,38 @@ def test_run_fallback_to_word_chunking_by_default_length_too_short(): def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit(): """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap""" - splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"], split_unit="char") + splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=10, separators=["sentence"], split_unit="char") text = "This is sentence one. This is sentence two. This is sentence three." doc = Document(content=text) doc_chunks = splitter.run([doc])["documents"] - assert len(doc_chunks) == 3 - + assert len(doc_chunks) == 4 assert doc_chunks[0].content == "This is sentence one. " assert doc_chunks[0].meta["split_id"] == 0 assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content) - assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 5)}] + assert doc_chunks[0].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (0, 10)}] - assert doc_chunks[1].content == "one. This is sentence two. " + assert doc_chunks[1].content == "ence one. This is sentenc" assert doc_chunks[1].meta["split_id"] == 1 assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content) assert doc_chunks[1].meta["_split_overlap"] == [ - {"doc_id": doc_chunks[0].id, "range": (17, 22)}, - {"doc_id": doc_chunks[2].id, "range": (0, 5)}, + {"doc_id": doc_chunks[0].id, "range": (12, 22)}, + {"doc_id": doc_chunks[2].id, "range": (0, 10)}, ] - assert doc_chunks[2].content == "two. This is sentence three." + assert doc_chunks[2].content == "is sentence two. This is " assert doc_chunks[2].meta["split_id"] == 2 assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content) - assert doc_chunks[2].meta["_split_overlap"] == [{"doc_id": doc_chunks[1].id, "range": (22, 27)}] + assert doc_chunks[2].meta["_split_overlap"] == [ + {"doc_id": doc_chunks[1].id, "range": (15, 25)}, + {"doc_id": doc_chunks[3].id, "range": (0, 10)}, + ] + + assert doc_chunks[3].content == ". This is sentence three." + assert doc_chunks[3].meta["split_id"] == 3 + assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content) + assert doc_chunks[3].meta["_split_overlap"] == [{"doc_id": doc_chunks[2].id, "range": (15, 25)}] def test_run_split_by_dot_count_page_breaks_word_unit() -> None: @@ -673,15 +687,20 @@ def test_run_custom_sentence_tokenizer_document_and_overlap_word_unit_no_overlap assert chunks[2].content == " This is sentence three." 
-def test_run_custom_sentence_tokenizer_document_and_overlap_word_unit_overlap_2_words(): - splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=2, separators=["."], split_unit="word") +def test_run_custom_split_by_dot_and_overlap_1_word_unit(): + splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=1, separators=["."], split_unit="word") text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." chunks = splitter.run([Document(content=text)])["documents"] - assert len(chunks) == 4 + assert len(chunks) == 5 assert chunks[0].content == "This is sentence one." - assert chunks[1].content == "sentence one. This is sentence two." - assert chunks[2].content == "sentence two. This is sentence three." - assert chunks[3].content == "sentence three. This is sentence four." + assert chunks[1].content == "one. This is sentence" + assert chunks[2].content == "sentence two. This is" + assert chunks[3].content == "is sentence three. This" + assert chunks[4].content == "This is sentence four." + + +def test_run_custom_split_by_dot_and_overlap_3_char_unit(): + pass def test_run_serialization_in_pipeline(): From c1fa6c2103eeb01364192858999b795ce487f9b4 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Sat, 21 Dec 2024 11:25:42 +0100 Subject: [PATCH 66/82] adding more tests --- .../preprocessors/recursive_splitter.py | 64 ++++++++----- .../preprocessors/test_recursive_splitter.py | 94 ++++++++++++++++++- 2 files changed, 133 insertions(+), 25 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 0caf63ab29..d04d33aedd 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -4,7 +4,7 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Literal, Optional +from typing import Any, Dict, List, Literal, Optional, Tuple, Union from haystack import Document, component, logging @@ -115,9 +115,27 @@ def _get_custom_sentence_tokenizer(sentence_splitter_params: Dict[str, Any]): return SentenceSplitter(**sentence_splitter_params) + def _split_chunk(self, current_chunk: str) -> Union[Tuple[str, List[str]], Tuple[str, str]]: + if self.split_units == "word": + words = current_chunk.split() + current_chunk = " ".join(words[: self.split_length]) + remaining_words = words[self.split_length :] + return current_chunk, remaining_words + + text = current_chunk + current_chunk = text[: self.split_length] + remaining_chars = text[self.split_length :] + return current_chunk, remaining_chars + def _apply_overlap(self, chunks: List[str]) -> List[str]: """ Applies an overlap between consecutive chunks if the chunk_overlap attribute is greater than zero. + + Works for both word- and character-level splitting. It trims the last chunk if it exceeds the split_length and + adds the trimmed content to the next chunk. If the last chunk is still too long after trimming, it splits it + and adds the first chunk to the list. This process continues until the last chunk is within the split_length. + + :param chunks: A list of text chunks. """ overlapped_chunks: List[str] = [] remaining_words: List[str] = [] @@ -129,7 +147,6 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: continue overlap, prev_chunk = self._get_overlap(overlapped_chunks) - if overlap == prev_chunk: logger.warning( "Overlap is the same as the previous chunk. 
" @@ -137,17 +154,12 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: ) # create new chunk starting with the overlap - if self.split_units == "word": - current_chunk = overlap + " " + chunk - else: - current_chunk = overlap + chunk + current_chunk = overlap + " " + chunk if self.split_units == "word" else overlap + chunk # if the new chunk exceeds split_length, trim it and add the trimmed content to the next chunk if self._chunk_length(current_chunk) > self.split_length: if self.split_units == "word": - words = current_chunk.split() - current_chunk = " ".join(words[: self.split_length]) - remaining_words = words[self.split_length :] + current_chunk, remaining_words = self._split_chunk(current_chunk) # type: ignore if idx < len(chunks) - 1: # add remaining words to the beginning of the next chunk chunks[idx + 1] = " ".join(remaining_words) + " " + chunks[idx + 1] @@ -157,9 +169,7 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: current_chunk = " ".join(remaining_words) else: # char-level splitting - text = current_chunk - current_chunk = text[: self.split_length] - remaining_chars = text[self.split_length :] + current_chunk, remaining_chars = self._split_chunk(current_chunk) # type: ignore if idx < len(chunks) - 1: # add remaining chars to the beginning of the next chunk chunks[idx + 1] = remaining_chars + chunks[idx + 1] @@ -170,16 +180,12 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: # if this is the last chunk, and we have remaining words or characters, add them to the current chunk if idx == len(chunks) - 1 and (remaining_words or remaining_chars): overlap, prev_chunk = self._get_overlap(overlapped_chunks) - if remaining_words: - current_chunk = overlap + " " + current_chunk - if remaining_chars: - current_chunk = overlap + current_chunk + current_chunk = overlap + " " + current_chunk if remaining_words else overlap + current_chunk overlapped_chunks.append(current_chunk) - # check if the last chunk exceeds split_length and split it + # new approach to split the last chunk if idx == len(chunks) - 1 and self._chunk_length(current_chunk) > self.split_length: - # split the last chunk and add the first chunk to the list last_chunk = overlapped_chunks.pop() if self.split_units == "word": words = last_chunk.split() @@ -188,16 +194,28 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: else: first_chunk = last_chunk[: self.split_length] remaining_chunk = last_chunk[self.split_length :] + overlapped_chunks.append(first_chunk) - # add the remaining chunk with overlap from the previous chunk - if remaining_chunk: + while remaining_chunk: overlap, prev_chunk = self._get_overlap(overlapped_chunks) if self.split_units == "word": - remaining_chunk = overlap + " " + remaining_chunk + current = overlap + " " + remaining_chunk + words = current.split() + if len(words) <= self.split_length: + overlapped_chunks.append(current) + break + first_chunk = " ".join(words[: self.split_length]) + remaining_chunk = " ".join(words[self.split_length :]) else: - remaining_chunk = overlap + remaining_chunk - overlapped_chunks.append(remaining_chunk) + current = overlap + remaining_chunk + if len(current) <= self.split_length: + overlapped_chunks.append(current) + break + first_chunk = current[: self.split_length] + remaining_chunk = current[self.split_length :] + + overlapped_chunks.append(first_chunk) return overlapped_chunks diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 
0413d5538f..8f2224219c 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -677,7 +677,7 @@ def test_run_split_by_sentence_count_page_breaks_word_unit() -> None: assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content) -def test_run_custom_sentence_tokenizer_document_and_overlap_word_unit_no_overlap(): +def test_run_split_by_sentence_tokenizer_document_and_overlap_word_unit_no_overlap(): splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=["."], split_unit="word") text = "This is sentence one. This is sentence two. This is sentence three." chunks = splitter.run([Document(content=text)])["documents"] @@ -687,7 +687,7 @@ def test_run_custom_sentence_tokenizer_document_and_overlap_word_unit_no_overlap assert chunks[2].content == " This is sentence three." -def test_run_custom_split_by_dot_and_overlap_1_word_unit(): +def test_run_split_by_dot_and_overlap_1_word_unit(): splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=1, separators=["."], split_unit="word") text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." chunks = splitter.run([Document(content=text)])["documents"] @@ -699,6 +699,96 @@ def test_run_custom_split_by_dot_and_overlap_1_word_unit(): assert chunks[4].content == "This is sentence four." +def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): + splitter = RecursiveDocumentSplitter(split_length=3, split_overlap=2, separators=["."], split_unit="word") + text = """A simple sentence1. A bright sentence2. A clever sentence3""" + doc = Document(content=text) + chunks = splitter.run([doc])["documents"] + assert len(chunks) == 7 + assert chunks[0].content == "A simple sentence1." + assert chunks[1].content == "simple sentence1. A" + assert chunks[2].content == "sentence1. A bright" + assert chunks[3].content == "A bright sentence2." + assert chunks[4].content == "bright sentence2. A" + assert chunks[5].content == "sentence2. A clever" + assert chunks[6].content == "A clever sentence3" + + +def test_run_trigger_dealing_with_remaining_char_larger_than_split_length(): + splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=15, separators=["."], split_unit="char") + text = """A simple sentence1. A bright sentence2. A clever sentence3""" + doc = Document(content=text) + chunks = splitter.run([doc])["documents"] + + assert len(chunks) == 9 + + assert chunks[0].content == "A simple sentence1." + assert chunks[0].meta["split_id"] == 0 + assert chunks[0].meta["split_idx_start"] == text.index(chunks[0].content) + assert chunks[0].meta["_split_overlap"] == [{"doc_id": chunks[1].id, "range": (0, 15)}] + + assert chunks[1].content == "mple sentence1. A br" + assert chunks[1].meta["split_id"] == 1 + assert chunks[1].meta["split_idx_start"] == text.index(chunks[1].content) + assert chunks[1].meta["_split_overlap"] == [ + {"doc_id": chunks[0].id, "range": (4, 19)}, + {"doc_id": chunks[2].id, "range": (0, 15)}, + ] + + assert chunks[2].content == "sentence1. A bright " + assert chunks[2].meta["split_id"] == 2 + assert chunks[2].meta["split_idx_start"] == text.index(chunks[2].content) + assert chunks[2].meta["_split_overlap"] == [ + {"doc_id": chunks[1].id, "range": (5, 20)}, + {"doc_id": chunks[3].id, "range": (0, 15)}, + ] + + assert chunks[3].content == "nce1. 
A bright sente" + assert chunks[3].meta["split_id"] == 3 + assert chunks[3].meta["split_idx_start"] == text.index(chunks[3].content) + assert chunks[3].meta["_split_overlap"] == [ + {"doc_id": chunks[2].id, "range": (5, 20)}, + {"doc_id": chunks[4].id, "range": (0, 15)}, + ] + + assert chunks[4].content == " A bright sentence2." + assert chunks[4].meta["split_id"] == 4 + assert chunks[4].meta["split_idx_start"] == text.index(chunks[4].content) + assert chunks[4].meta["_split_overlap"] == [ + {"doc_id": chunks[3].id, "range": (5, 20)}, + {"doc_id": chunks[5].id, "range": (0, 15)}, + ] + + assert chunks[5].content == "ight sentence2. A cl" + assert chunks[5].meta["split_id"] == 5 + assert chunks[5].meta["split_idx_start"] == text.index(chunks[5].content) + assert chunks[5].meta["_split_overlap"] == [ + {"doc_id": chunks[4].id, "range": (5, 20)}, + {"doc_id": chunks[6].id, "range": (0, 15)}, + ] + + assert chunks[6].content == "sentence2. A clever " + assert chunks[6].meta["split_id"] == 6 + assert chunks[6].meta["split_idx_start"] == text.index(chunks[6].content) + assert chunks[6].meta["_split_overlap"] == [ + {"doc_id": chunks[5].id, "range": (5, 20)}, + {"doc_id": chunks[7].id, "range": (0, 15)}, + ] + + assert chunks[7].content == "nce2. A clever sente" + assert chunks[7].meta["split_id"] == 7 + assert chunks[7].meta["split_idx_start"] == text.index(chunks[7].content) + assert chunks[7].meta["_split_overlap"] == [ + {"doc_id": chunks[6].id, "range": (5, 20)}, + {"doc_id": chunks[8].id, "range": (0, 15)}, + ] + + assert chunks[8].content == " A clever sentence3" + assert chunks[8].meta["split_id"] == 8 + assert chunks[8].meta["split_idx_start"] == text.index(chunks[8].content) + assert chunks[8].meta["_split_overlap"] == [{"doc_id": chunks[7].id, "range": (5, 20)}] + + def test_run_custom_split_by_dot_and_overlap_3_char_unit(): pass From de5e951589416182890a5c060238948e2f44580f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Sat, 21 Dec 2024 12:11:17 +0100 Subject: [PATCH 67/82] refactoring _apply_overlap --- .../preprocessors/recursive_splitter.py | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index d04d33aedd..633cc7b257 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -4,7 +4,7 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Optional, Tuple from haystack import Document, component, logging @@ -115,13 +115,21 @@ def _get_custom_sentence_tokenizer(sentence_splitter_params: Dict[str, Any]): return SentenceSplitter(**sentence_splitter_params) - def _split_chunk(self, current_chunk: str) -> Union[Tuple[str, List[str]], Tuple[str, str]]: + def _split_chunk(self, current_chunk: str) -> Tuple[str, str]: + """ + Splits a chunk based on the split_length and split_units attribute. + + :param current_chunk: The current chunk to be split. + :returns: + A tuple containing the current chunk and the remaining words or characters. 
+ """ + if self.split_units == "word": words = current_chunk.split() current_chunk = " ".join(words[: self.split_length]) remaining_words = words[self.split_length :] - return current_chunk, remaining_words - + return current_chunk, " ".join(remaining_words) + # split by characters text = current_chunk current_chunk = text[: self.split_length] remaining_chars = text[self.split_length :] @@ -136,10 +144,11 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: and adds the first chunk to the list. This process continues until the last chunk is within the split_length. :param chunks: A list of text chunks. + :returns: + A list of text chunks with the overlap applied. """ overlapped_chunks: List[str] = [] - remaining_words: List[str] = [] - remaining_chars: str = "" + remaining_text: str = "" for idx, chunk in enumerate(chunks): if idx == 0: @@ -157,30 +166,22 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: current_chunk = overlap + " " + chunk if self.split_units == "word" else overlap + chunk # if the new chunk exceeds split_length, trim it and add the trimmed content to the next chunk + # if we are at the last chunk the next new chunk contains the remaining text if self._chunk_length(current_chunk) > self.split_length: - if self.split_units == "word": - current_chunk, remaining_words = self._split_chunk(current_chunk) # type: ignore - if idx < len(chunks) - 1: - # add remaining words to the beginning of the next chunk - chunks[idx + 1] = " ".join(remaining_words) + " " + chunks[idx + 1] - elif remaining_words: - # if this is the last chunk, and we have remaining words - overlapped_chunks.append(current_chunk) - current_chunk = " ".join(remaining_words) - - else: # char-level splitting - current_chunk, remaining_chars = self._split_chunk(current_chunk) # type: ignore - if idx < len(chunks) - 1: - # add remaining chars to the beginning of the next chunk - chunks[idx + 1] = remaining_chars + chunks[idx + 1] - elif remaining_chars: # if this is the last chunk and we have remaining chars - overlapped_chunks.append(current_chunk) - current_chunk = remaining_chars - - # if this is the last chunk, and we have remaining words or characters, add them to the current chunk - if idx == len(chunks) - 1 and (remaining_words or remaining_chars): + current_chunk, remaining_text = self._split_chunk(current_chunk) + if idx < len(chunks) - 1: + chunks[idx + 1] = remaining_text + (" " if self.split_units == "word" else "") + chunks[idx + 1] + elif remaining_text: + overlapped_chunks.append(current_chunk) + current_chunk = remaining_text + + # if this is the last chunk, and we have remaining text add them to the current chunk + if idx == len(chunks) - 1 and remaining_text: overlap, prev_chunk = self._get_overlap(overlapped_chunks) - current_chunk = overlap + " " + current_chunk if remaining_words else overlap + current_chunk + if self.split_units == "word": + current_chunk = overlap + " " + current_chunk + if self.split_units == "char": + current_chunk = overlap + current_chunk overlapped_chunks.append(current_chunk) From 81c7c89caf46501650c5802e4d89a28201f15a66 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Sat, 21 Dec 2024 13:12:08 +0100 Subject: [PATCH 68/82] further refactoring --- .../preprocessors/recursive_splitter.py | 60 +++++++------------ 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 633cc7b257..33d3940458 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -148,13 +148,13 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: A list of text chunks with the overlap applied. """ overlapped_chunks: List[str] = [] - remaining_text: str = "" for idx, chunk in enumerate(chunks): if idx == 0: overlapped_chunks.append(chunk) continue + # get the overlap between the current and previous chunk overlap, prev_chunk = self._get_overlap(overlapped_chunks) if overlap == prev_chunk: logger.warning( @@ -162,60 +162,44 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: "Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter." ) - # create new chunk starting with the overlap + # create a new chunk starting with the overlap current_chunk = overlap + " " + chunk if self.split_units == "word" else overlap + chunk - # if the new chunk exceeds split_length, trim it and add the trimmed content to the next chunk - # if we are at the last chunk the next new chunk contains the remaining text + # if this new chunk exceeds 'split_length', trim it and move the remaining text to the next chunk + # if this is the last chunk, another new chunk will contain the trimmed text preceded by the overlap + # of the last chunk if self._chunk_length(current_chunk) > self.split_length: current_chunk, remaining_text = self._split_chunk(current_chunk) if idx < len(chunks) - 1: chunks[idx + 1] = remaining_text + (" " if self.split_units == "word" else "") + chunks[idx + 1] elif remaining_text: + # create a new chunk with the trimmed text preceded by the overlap of the last chunk overlapped_chunks.append(current_chunk) - current_chunk = remaining_text - - # if this is the last chunk, and we have remaining text add them to the current chunk - if idx == len(chunks) - 1 and remaining_text: - overlap, prev_chunk = self._get_overlap(overlapped_chunks) - if self.split_units == "word": - current_chunk = overlap + " " + current_chunk - if self.split_units == "char": - current_chunk = overlap + current_chunk + chunk = remaining_text + overlap, _ = self._get_overlap(overlapped_chunks) + current_chunk = overlap + " " + chunk if self.split_units == "word" else overlap + chunk overlapped_chunks.append(current_chunk) - # new approach to split the last chunk + # it can still be that the new last chunk exceeds the 'split_length' + # continue splitting until the last chunk is within 'split_length' if idx == len(chunks) - 1 and self._chunk_length(current_chunk) > self.split_length: last_chunk = overlapped_chunks.pop() - if self.split_units == "word": - words = last_chunk.split() - first_chunk = " ".join(words[: self.split_length]) - remaining_chunk = " ".join(words[self.split_length :]) - else: - first_chunk = last_chunk[: self.split_length] - remaining_chunk = last_chunk[self.split_length :] - + first_chunk, remaining_chunk = self._split_chunk(last_chunk) overlapped_chunks.append(first_chunk) while remaining_chunk: - overlap, prev_chunk = self._get_overlap(overlapped_chunks) - if self.split_units == "word": - current = overlap + " " + remaining_chunk - words = 
current.split() - if len(words) <= self.split_length: - overlapped_chunks.append(current) - break - first_chunk = " ".join(words[: self.split_length]) - remaining_chunk = " ".join(words[self.split_length :]) - else: - current = overlap + remaining_chunk - if len(current) <= self.split_length: - overlapped_chunks.append(current) - break - first_chunk = current[: self.split_length] - remaining_chunk = current[self.split_length :] + # combine overlap with remaining chunk + overlap, _ = self._get_overlap(overlapped_chunks) + current = overlap + (" " if self.split_units == "word" else "") + remaining_chunk + + # if it fits within split_length we are done + if self._chunk_length(current) <= self.split_length: + overlapped_chunks.append(current) + break + # otherwise split it again + first_chunk, remaining_chunk = self._split_chunk(current) overlapped_chunks.append(first_chunk) return overlapped_chunks From 50ac7afba317918cacbb16516d60c092a7589e0f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 8 Jan 2025 11:44:09 +0100 Subject: [PATCH 69/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 33d3940458..65cd7c17e8 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -63,7 +63,7 @@ def __init__( """ Initializes a RecursiveDocumentSplitter. - :param split_length: The maximum length of each chunk by default in characters, but can be in words. + :param split_length: The maximum length of each chunk by default in words, but can be in characters. See the `split_units` parameter. :param split_overlap: The number of characters to overlap between consecutive chunks. :param split_unit: The unit of the split_length parameter. It can be either "word" or "char". From 602ac9bd6a5e1c2c1d01639e46ec723dcb8dc61d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 8 Jan 2025 12:15:39 +0100 Subject: [PATCH 70/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 65cd7c17e8..7d26537a89 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -85,8 +85,11 @@ def __init__( self.sentence_tokenizer_params = sentence_splitter_params self._check_params() self.nltk_tokenizer = None - if "sentence" in self.separators: - self.warm_up(sentence_splitter_params) + self.sentence_splitter_params = sentence_splitter_params + if self.sentence_splitter_params is None: + self.sentence_splitter_params = {"keep_white_spaces": True} + else: + self.sentence_splitter_params["keep_white_spaces"] = True def warm_up(self, sentence_splitter_params): """ From 78ebc710158e2db9ad13ebe6d5e49966e388fcc6 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 8 Jan 2025 12:16:08 +0100 Subject: [PATCH 71/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 7d26537a89..c9e11af477 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -91,15 +91,10 @@ def __init__( else: self.sentence_splitter_params["keep_white_spaces"] = True - def warm_up(self, sentence_splitter_params): + def warm_up(self): """ Warm up the sentence tokenizer. - - :param sentence_splitter_params: Optional parameters to pass to the sentence tokenizer. - :returns: - An instance of the SentenceSplitter. """ - sentence_splitter_params = sentence_splitter_params or {"keep_white_spaces": True} self.nltk_tokenizer = self._get_custom_sentence_tokenizer(sentence_splitter_params) def _check_params(self): From a6a2475839036959a8ddca2c48e194b628fb8a3c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 8 Jan 2025 12:17:16 +0100 Subject: [PATCH 72/82] Update haystack/components/preprocessors/recursive_splitter.py Co-authored-by: Sebastian Husch Lee --- haystack/components/preprocessors/recursive_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index c9e11af477..8736a8ba36 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -39,7 +39,7 @@ class RecursiveDocumentSplitter: AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.''' - + chunker.warm_up() doc = Document(content=text) doc_chunks = chunker.run([doc]) print(doc_chunks["documents"]) From 80b8f2c50d72423caf9a4681997cad86f0126344 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 8 Jan 2025 11:44:29 +0100 Subject: [PATCH 73/82] adding ticks to close code block --- haystack/components/preprocessors/recursive_splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 8736a8ba36..1024d842e7 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -49,6 +49,7 @@ class RecursiveDocumentSplitter: >Document(id=..., content: 'AI technology is widely used throughout industry, government, and science.', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 2, 'split_idx_start': 142, '_split_overlap': []}) >Document(id=..., content: ' Some high-profile applications include advanced web search engines; recommendation systems; interac...', meta: {'original_id': '65167a9823dd883de577e828ca4fd529e6f7241f0ff616acfce454d808478951', 'split_id': 3, 'split_idx_start': 216, '_split_overlap': []}) >] + ``` """ # noqa: E501 def __init__( From 2040c7c36960e630ec2b658c6ba04836a37fd64c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 8 Jan 2025 12:16:32 +0100 Subject: [PATCH 74/82] fixing comments --- haystack/components/preprocessors/recursive_splitter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 1024d842e7..52abc6cc2c 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -223,8 +223,7 @@ def _chunk_length(self, text: str) -> int: The length of the chunk in words or characters. """ if self.split_units == "word": - # page breaks are counted as a single word or page breaks followed by only whitespace 1 or multiple times - # regex that matches a page break followed by only whitespace 1 or multiple times + # regex that matches a page break followed by one or multiple whitespaces if re.match(r"\f\s*", text): return 1 From 977de8e9c855fc1749bd851b63ae229b98934e1e Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 8 Jan 2025 16:09:42 +0100 Subject: [PATCH 75/82] applying changes: split with space and force keep_white_spaces=True --- haystack/components/preprocessors/recursive_splitter.py | 9 +++++---- test/components/preprocessors/test_recursive_splitter.py | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 52abc6cc2c..22dbf9b213 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -96,7 +96,7 @@ def warm_up(self): """ Warm up the sentence tokenizer. 
""" - self.nltk_tokenizer = self._get_custom_sentence_tokenizer(sentence_splitter_params) + self.nltk_tokenizer = self._get_custom_sentence_tokenizer(self.sentence_splitter_params) def _check_params(self): if self.split_length < 1: @@ -227,7 +227,7 @@ def _chunk_length(self, text: str) -> int: if re.match(r"\f\s*", text): return 1 - return len(text.split()) + return len(text.split(" ")) else: return len(text) @@ -292,7 +292,8 @@ def _chunk_text(self, text: str) -> List[str]: if curr_separator == self.separators[-1]: # tried the last separator, can't split further, break the loop and fall back to # word- or character-level chunking - break + # break + return self.fall_back_to_fixed_chunking(text, self.split_units) chunks.extend(self._chunk_text(split_text)) else: current_chunk.append(split_text) @@ -326,7 +327,7 @@ def fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", "c step = self.split_length - self.split_overlap if split_units == "word": - words = text.split() + words = text.split(" ") for i in range(0, self._chunk_length(text), step): chunks.append(" ".join(words[i : i + self.split_length])) else: diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 8f2224219c..df088b6e12 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -114,6 +114,7 @@ def test_run_using_custom_sentence_tokenizer(): separators=["\n\n", "\n", "sentence", " "], sentence_splitter_params={"language": "en", "use_split_rules": True, "keep_white_spaces": False}, ) + splitter.warm_up() text = """Artificial intelligence (AI) - Introduction AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems. @@ -307,6 +308,7 @@ def test_run_split_by_sentence_count_page_breaks_split_unit_char() -> None: document_splitter = RecursiveDocumentSplitter( separators=["sentence"], split_length=28, split_overlap=0, split_unit="char" ) + document_splitter.warm_up() text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" @@ -431,6 +433,7 @@ def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit(): splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=10, separators=["sentence"], split_unit="char") text = "This is sentence one. This is sentence two. This is sentence three." + splitter.warm_up() doc = Document(content=text) doc_chunks = splitter.run([doc])["documents"] @@ -631,6 +634,7 @@ def test_run_split_by_sentence_count_page_breaks_word_unit() -> None: document_splitter = RecursiveDocumentSplitter( separators=["sentence"], split_length=7, split_overlap=0, split_unit="word" ) + document_splitter.warm_up() text = ( "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" From d87ffe6142757089d26c078015123de7b6c17a5a Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 8 Jan 2025 16:34:10 +0100 Subject: [PATCH 76/82] fixing some tests and replacing count words approach in more places --- .../preprocessors/recursive_splitter.py | 4 ++-- .../preprocessors/test_recursive_splitter.py | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 22dbf9b213..5d22606aaa 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -124,7 +124,7 @@ def _split_chunk(self, current_chunk: str) -> Tuple[str, str]: """ if self.split_units == "word": - words = current_chunk.split() + words = current_chunk.split(" ") current_chunk = " ".join(words[: self.split_length]) remaining_words = words[self.split_length :] return current_chunk, " ".join(remaining_words) @@ -208,7 +208,7 @@ def _get_overlap(self, overlapped_chunks): prev_chunk = overlapped_chunks[-1] overlap_start = max(0, self._chunk_length(prev_chunk) - self.split_overlap) if self.split_units == "word": - word_chunks = prev_chunk.split() + word_chunks = prev_chunk.split(" ") overlap = " ".join(word_chunks[overlap_start:]) else: overlap = prev_chunk[overlap_start:] diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index df088b6e12..0ac99e4d70 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -519,7 +519,12 @@ def test_run_split_by_word_count_page_breaks_word_unit(): doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - assert len(doc_chunks) == 5 + print("\n\n") + + for idx, c in enumerate(doc_chunks): + print(idx, c.content) + + assert len(doc_chunks) == 9 assert doc_chunks[0].content == "This is some text. " assert doc_chunks[0].meta["page_number"] == 1 assert doc_chunks[0].meta["split_id"] == 0 @@ -687,20 +692,21 @@ def test_run_split_by_sentence_tokenizer_document_and_overlap_word_unit_no_overl chunks = splitter.run([Document(content=text)])["documents"] assert len(chunks) == 3 assert chunks[0].content == "This is sentence one." - assert chunks[1].content == " This is sentence two." - assert chunks[2].content == " This is sentence three." + assert chunks[1].content == "This is sentence two." + assert chunks[2].content == "This is sentence three." def test_run_split_by_dot_and_overlap_1_word_unit(): splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=1, separators=["."], split_unit="word") text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." chunks = splitter.run([Document(content=text)])["documents"] - assert len(chunks) == 5 + assert len(chunks) == 6 assert chunks[0].content == "This is sentence one." assert chunks[1].content == "one. This is sentence" assert chunks[2].content == "sentence two. This is" assert chunks[3].content == "is sentence three. This" assert chunks[4].content == "This is sentence four." + assert chunks[5].content == "four." def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): @@ -708,7 +714,7 @@ def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): text = """A simple sentence1. A bright sentence2. 
A clever sentence3""" doc = Document(content=text) chunks = splitter.run([doc])["documents"] - assert len(chunks) == 7 + assert len(chunks) == 9 assert chunks[0].content == "A simple sentence1." assert chunks[1].content == "simple sentence1. A" assert chunks[2].content == "sentence1. A bright" @@ -716,6 +722,8 @@ def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): assert chunks[4].content == "bright sentence2. A" assert chunks[5].content == "sentence2. A clever" assert chunks[6].content == "A clever sentence3" + assert chunks[7].content == "clever sentence3" + assert chunks[8].content == "sentence3" def test_run_trigger_dealing_with_remaining_char_larger_than_split_length(): From df214d6215990cd0b13e170121eedee4992a4604 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 9 Jan 2025 12:14:00 +0100 Subject: [PATCH 77/82] keep_white_spaces = True only if not defined --- .../components/preprocessors/recursive_splitter.py | 11 ++++------- .../preprocessors/test_recursive_splitter.py | 5 ----- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 5d22606aaa..d684871456 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -83,14 +83,11 @@ def __init__( self.split_overlap = split_overlap self.split_units = split_unit self.separators = separators if separators else ["\n\n", "sentence", "\n", " "] # default separators - self.sentence_tokenizer_params = sentence_splitter_params self._check_params() self.nltk_tokenizer = None - self.sentence_splitter_params = sentence_splitter_params - if self.sentence_splitter_params is None: - self.sentence_splitter_params = {"keep_white_spaces": True} - else: - self.sentence_splitter_params["keep_white_spaces"] = True + self.sentence_splitter_params = ( + {"keep_white_spaces": True} if sentence_splitter_params is None else sentence_splitter_params + ) def warm_up(self): """ @@ -128,6 +125,7 @@ def _split_chunk(self, current_chunk: str) -> Tuple[str, str]: current_chunk = " ".join(words[: self.split_length]) remaining_words = words[self.split_length :] return current_chunk, " ".join(remaining_words) + # split by characters text = current_chunk current_chunk = text[: self.split_length] @@ -292,7 +290,6 @@ def _chunk_text(self, text: str) -> List[str]: if curr_separator == self.separators[-1]: # tried the last separator, can't split further, break the loop and fall back to # word- or character-level chunking - # break return self.fall_back_to_fixed_chunking(text, self.split_units) chunks.extend(self._chunk_text(split_text)) else: diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 0ac99e4d70..40181bd2c2 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -519,11 +519,6 @@ def test_run_split_by_word_count_page_breaks_word_unit(): doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - print("\n\n") - - for idx, c in enumerate(doc_chunks): - print(idx, c.content) - assert len(doc_chunks) == 9 assert doc_chunks[0].content == "This is some text. " assert doc_chunks[0].meta["page_number"] == 1 From 951956b0735a1432ffb8f9936b58bef8bc782961 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 9 Jan 2025 16:04:08 +0100 Subject: [PATCH 78/82] cleaning docs --- .../preprocessors/recursive_splitter.py | 33 ++++++++++++------- .../preprocessors/test_recursive_splitter.py | 2 +- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index d684871456..c517c7075c 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -221,14 +221,26 @@ def _chunk_length(self, text: str) -> int: The length of the chunk in words or characters. """ if self.split_units == "word": - # regex that matches a page break followed by one or multiple whitespaces - if re.match(r"\f\s*", text): - return 1 - return len(text.split(" ")) else: return len(text) + # def _chunk_length(self, text: str) -> int: + # """ + # Split the text by whitespace and count non-empty elements + # Count newline and form feed characters + # + # :param text: + # :return: + # """ + # + # if self.split_units == "word": + # words = [word for word in text.split() if word] + # special_chars = text.count('\n') + text.count('\f') + text.count('\x0c') + # return len(words) + special_chars + # + # return len(text) + def _chunk_text(self, text: str) -> List[str]: """ Recursive chunking algorithm that divides text into smaller chunks based on a list of separator characters. @@ -247,23 +259,22 @@ def _chunk_text(self, text: str) -> List[str]: for curr_separator in self.separators: # type: ignore # the caller already checked that separators is not None if curr_separator == "sentence": - # correct SentenceSplitter initialization is checked at the initialization of the component + # re. ignore: correct SentenceSplitter initialization is checked at the initialization of the component sentence_with_spans = self.nltk_tokenizer.split_sentences(text) # type: ignore splits = [sentence["sentence"] for sentence in sentence_with_spans] else: + # add escape "\" to the separator and wrapped it in a group so that it's included in the splits as well escaped_separator = re.escape(curr_separator) - escaped_separator = ( - f"({escaped_separator})" # wrap the separator in a group to include it in the splits - ) - splits = re.split(escaped_separator, text) + escaped_separator = f"({escaped_separator})" - # merge every two consecutive splits, i.e.: the text and the separator after it + # split the text and merge every two consecutive splits, i.e.: the text and the separator after it + splits = re.split(escaped_separator, text) splits = [ "".join([splits[i], splits[i + 1]]) if i < len(splits) - 1 else splits[i] for i in range(0, len(splits), 2) ] - # remove last split if it is empty + # remove last split if it's empty splits = splits[:-1] if splits[-1] == "" else splits if len(splits) == 1: # go to next separator, if current separator not found in the text diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 40181bd2c2..d3c86f417d 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -519,7 +519,7 @@ def test_run_split_by_word_count_page_breaks_word_unit(): doc_chunks = splitter.run([doc]) doc_chunks = doc_chunks["documents"] - assert len(doc_chunks) == 9 + assert len(doc_chunks) == 5 assert doc_chunks[0].content == "This is some text. 
" assert doc_chunks[0].meta["page_number"] == 1 assert doc_chunks[0].meta["split_id"] == 0 From e1464eb41cb7792ae1b3f3e288c3dc942132b146 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 9 Jan 2025 19:16:29 +0100 Subject: [PATCH 79/82] handling some more edge cases, when split is still too big and all separators ran --- .../preprocessors/recursive_splitter.py | 54 ++++++++----------- .../preprocessors/test_recursive_splitter.py | 29 +++++----- 2 files changed, 38 insertions(+), 45 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index c517c7075c..b6e3dc4306 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -121,7 +121,7 @@ def _split_chunk(self, current_chunk: str) -> Tuple[str, str]: """ if self.split_units == "word": - words = current_chunk.split(" ") + words = current_chunk.split() current_chunk = " ".join(words[: self.split_length]) remaining_words = words[self.split_length :] return current_chunk, " ".join(remaining_words) @@ -201,12 +201,12 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]: return overlapped_chunks - def _get_overlap(self, overlapped_chunks): + def _get_overlap(self, overlapped_chunks: List[str]) -> Tuple[str, str]: """Get the previous overlapped chunk instead of the original chunk.""" prev_chunk = overlapped_chunks[-1] overlap_start = max(0, self._chunk_length(prev_chunk) - self.split_overlap) if self.split_units == "word": - word_chunks = prev_chunk.split(" ") + word_chunks = prev_chunk.split() overlap = " ".join(word_chunks[overlap_start:]) else: overlap = prev_chunk[overlap_start:] @@ -214,32 +214,17 @@ def _get_overlap(self, overlapped_chunks): def _chunk_length(self, text: str) -> int: """ - Get the length of the chunk in words or characters. + Split the text by whitespace and count non-empty elements. - :param text: The text to be split into chunks. - :returns: - The length of the chunk in words or characters. + :param: The text to be split. + :return: The number of words in the text. 
""" + if self.split_units == "word": - return len(text.split(" ")) - else: - return len(text) - - # def _chunk_length(self, text: str) -> int: - # """ - # Split the text by whitespace and count non-empty elements - # Count newline and form feed characters - # - # :param text: - # :return: - # """ - # - # if self.split_units == "word": - # words = [word for word in text.split() if word] - # special_chars = text.count('\n') + text.count('\f') + text.count('\x0c') - # return len(words) + special_chars - # - # return len(text) + words = [word for word in text.split(" ") if word] + return len(words) + + return len(text) def _chunk_text(self, text: str) -> List[str]: """ @@ -299,10 +284,13 @@ def _chunk_text(self, text: str) -> List[str]: # recursively handle splits that are too large if self._chunk_length(split_text) > self.split_length: if curr_separator == self.separators[-1]: - # tried the last separator, can't split further, break the loop and fall back to - # word- or character-level chunking - return self.fall_back_to_fixed_chunking(text, self.split_units) - chunks.extend(self._chunk_text(split_text)) + # tried last separator, can't split further, do a fixed-split based on word/character + fall_back_chunks = self._fall_back_to_fixed_chunking(split_text, self.split_units) + chunks.extend(fall_back_chunks) + else: + chunks.extend(self._chunk_text(split_text)) + current_length += self._chunk_length(split_text) + else: current_chunk.append(split_text) current_length += self._chunk_length(split_text) @@ -320,9 +308,9 @@ def _chunk_text(self, text: str) -> List[str]: return chunks # if no separator worked, fall back to word- or character-level chunking - return self.fall_back_to_fixed_chunking(text, self.split_units) + return self._fall_back_to_fixed_chunking(text, self.split_units) - def fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", "char"]) -> List[str]: + def _fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", "char"]) -> List[str]: """ Fall back to a fixed chunking approach if no separator works for the text. @@ -336,7 +324,7 @@ def fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", "c if split_units == "word": words = text.split(" ") - for i in range(0, self._chunk_length(text), step): + for idx, i in enumerate(range(0, self._chunk_length(text), step)): chunks.append(" ".join(words[i : i + self.split_length])) else: for i in range(0, self._chunk_length(text), step): diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index d3c86f417d..3ea73e09fa 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -1,3 +1,5 @@ +import re + import pytest from pytest import LogCaptureFixture @@ -401,11 +403,12 @@ def test_run_split_document_with_overlap_character_unit(): def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_chunking(): splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2, split_unit="char") - doc = Document(content="This is some text. 
This is some more text.") + doc = Document(content="This is some text") result = splitter.run(documents=[doc]) - assert len(result["documents"]) == 21 + assert len(result["documents"]) == 10 for doc in result["documents"]: - assert len(doc.content) == 2 + if re.escape(doc.content) not in ["\ "]: + assert len(doc.content) == 2 def test_run_fallback_to_character_chunking_by_default_length_too_short(): @@ -475,7 +478,7 @@ def test_run_split_by_dot_count_page_breaks_word_unit() -> None: documents = document_splitter.run(documents=[Document(content=text)])["documents"] - assert len(documents) == 7 + assert len(documents) == 8 assert documents[0].content == "Sentence on page 1." assert documents[0].meta["page_number"] == 1 assert documents[0].meta["split_id"] == 0 @@ -506,11 +509,16 @@ def test_run_split_by_dot_count_page_breaks_word_unit() -> None: assert documents[5].meta["split_id"] == 5 assert documents[5].meta["split_idx_start"] == text.index(documents[5].content) - assert documents[6].content == "\f\f Sentence on page 5." + assert documents[6].content == "\f\f Sentence on page" assert documents[6].meta["page_number"] == 5 assert documents[6].meta["split_id"] == 6 assert documents[6].meta["split_idx_start"] == text.index(documents[6].content) + assert documents[7].content == " 5." + assert documents[7].meta["page_number"] == 5 + assert documents[7].meta["split_id"] == 7 + assert documents[7].meta["split_idx_start"] == text.index(documents[7].content) + def test_run_split_by_word_count_page_breaks_word_unit(): splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=0, separators=[" "], split_unit="word") @@ -687,21 +695,20 @@ def test_run_split_by_sentence_tokenizer_document_and_overlap_word_unit_no_overl chunks = splitter.run([Document(content=text)])["documents"] assert len(chunks) == 3 assert chunks[0].content == "This is sentence one." - assert chunks[1].content == "This is sentence two." - assert chunks[2].content == "This is sentence three." + assert chunks[1].content == " This is sentence two." + assert chunks[2].content == " This is sentence three." def test_run_split_by_dot_and_overlap_1_word_unit(): splitter = RecursiveDocumentSplitter(split_length=4, split_overlap=1, separators=["."], split_unit="word") text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four." chunks = splitter.run([Document(content=text)])["documents"] - assert len(chunks) == 6 + assert len(chunks) == 5 assert chunks[0].content == "This is sentence one." assert chunks[1].content == "one. This is sentence" assert chunks[2].content == "sentence two. This is" assert chunks[3].content == "is sentence three. This" assert chunks[4].content == "This is sentence four." - assert chunks[5].content == "four." def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): @@ -709,7 +716,7 @@ def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): text = """A simple sentence1. A bright sentence2. A clever sentence3""" doc = Document(content=text) chunks = splitter.run([doc])["documents"] - assert len(chunks) == 9 + assert len(chunks) == 7 assert chunks[0].content == "A simple sentence1." assert chunks[1].content == "simple sentence1. A" assert chunks[2].content == "sentence1. A bright" @@ -717,8 +724,6 @@ def test_run_trigger_dealing_with_remaining_word_larger_than_split_length(): assert chunks[4].content == "bright sentence2. A" assert chunks[5].content == "sentence2. 
A clever" assert chunks[6].content == "A clever sentence3" - assert chunks[7].content == "clever sentence3" - assert chunks[8].content == "sentence3" def test_run_trigger_dealing_with_remaining_char_larger_than_split_length(): From 3eb532c72e79814c90c65077ed7def1543308459 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 10 Jan 2025 16:54:53 +0100 Subject: [PATCH 80/82] fixing fallback whitespaces count to fixed word/char split based on split size --- .../preprocessors/recursive_splitter.py | 21 ++++++++++++++++--- .../preprocessors/test_recursive_splitter.py | 7 +++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index b6e3dc4306..6c7b2d86d5 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -323,9 +323,24 @@ def _fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", " step = self.split_length - self.split_overlap if split_units == "word": - words = text.split(" ") - for idx, i in enumerate(range(0, self._chunk_length(text), step)): - chunks.append(" ".join(words[i : i + self.split_length])) + words = re.findall(r"\S+|\s+", text) + current_chunk = [] + current_length = 0 + + for word in words: + if word != " ": + current_chunk.append(word) + current_length += 1 + if current_length == step and current_chunk: + chunks.append("".join(current_chunk)) + current_chunk = [] + current_length = 0 + else: + current_chunk.append(word) + + if current_chunk: + chunks.append("".join(current_chunk)) + else: for i in range(0, self._chunk_length(text), step): chunks.append(text[i : i + self.split_length]) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index 3ea73e09fa..caed22c371 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -804,6 +804,13 @@ def test_run_trigger_dealing_with_remaining_char_larger_than_split_length(): def test_run_custom_split_by_dot_and_overlap_3_char_unit(): pass + document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=4, split_overlap=0, split_unit="word") + text = "\x0c\x0c Sentence on page 5." + chunks = document_splitter._fall_back_to_fixed_chunking(text, split_units="word") + assert len(chunks) == 2 + assert chunks[0] == "\x0c\x0c Sentence on page" + assert chunks[1] == " 5." + def test_run_serialization_in_pipeline(): pipeline = Pipeline() From 38fce465adbd960db017e128f2ce2cd81828be21 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 10 Jan 2025 16:58:06 +0100 Subject: [PATCH 81/82] cleaning --- test/components/preprocessors/test_recursive_splitter.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py index caed22c371..8f55a75b0a 100644 --- a/test/components/preprocessors/test_recursive_splitter.py +++ b/test/components/preprocessors/test_recursive_splitter.py @@ -802,8 +802,6 @@ def test_run_trigger_dealing_with_remaining_char_larger_than_split_length(): def test_run_custom_split_by_dot_and_overlap_3_char_unit(): - pass - document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=4, split_overlap=0, split_unit="word") text = "\x0c\x0c Sentence on page 5." 
chunks = document_splitter._fall_back_to_fixed_chunking(text, split_units="word") From c5d8b2f6f8d6fc0a8f3da4a06bb7b8cb030bbcba Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 10 Jan 2025 17:02:03 +0100 Subject: [PATCH 82/82] cleaning --- haystack/components/preprocessors/recursive_splitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py index 6c7b2d86d5..3286a80d72 100644 --- a/haystack/components/preprocessors/recursive_splitter.py +++ b/haystack/components/preprocessors/recursive_splitter.py @@ -89,13 +89,13 @@ def __init__( {"keep_white_spaces": True} if sentence_splitter_params is None else sentence_splitter_params ) - def warm_up(self): + def warm_up(self) -> None: """ Warm up the sentence tokenizer. """ self.nltk_tokenizer = self._get_custom_sentence_tokenizer(self.sentence_splitter_params) - def _check_params(self): + def _check_params(self) -> None: if self.split_length < 1: raise ValueError("Split length must be at least 1 character.") if self.split_overlap < 0: @@ -314,6 +314,9 @@ def _fall_back_to_fixed_chunking(self, text: str, split_units: Literal["word", " """ Fall back to a fixed chunking approach if no separator works for the text. + Splits the text into smaller chunks based on the split_length and split_units attributes, either by words or + characters. It splits into words using whitespace as a separator. + :param text: The text to be split into chunks. :param split_units: The unit of the split_length parameter. It can be either "word" or "char". :returns: