Skip to content

Commit

Permalink
Improve pydocs
Browse files (browse the repository at this point in the history)
  • Loading branch information
vblagoje committed Sep 10, 2024
1 parent 58f290f commit 9af91c0
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
18 changes: 18 additions & 0 deletions haystack/components/preprocessors/nltk_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,14 @@ def __init__(
self.language = language

def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
"""
Splits the text into units based on the specified split_by parameter.
:param text: The text to split.
:param split_by: The unit to split the text by. Choose from "word", "sentence", "passage", or "page".
:returns: A list of units.
"""

if split_by == "page":
self.split_at = "\f"
units = text.split(self.split_at)
Expand Down Expand Up @@ -148,6 +156,11 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int) -> int:
"""
Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.
:param sentences: The list of sentences to split.
:param split_length: The maximum number of words in each split.
:param split_overlap: The number of overlapping words in each split.
:returns: The number of sentences to keep in the next chunk.
"""
# If the split_overlap is 0, we don't need to keep any sentences
if split_overlap == 0:
Expand All @@ -170,6 +183,11 @@ def _concatenate_sentences_based_on_word_amount(
) -> Tuple[List[str], List[int], List[int]]:
"""
Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
:param sentences: The list of sentences to split.
:param split_length: The maximum number of words in each split.
:param split_overlap: The number of overlapping words in each split.
:returns: A tuple containing the concatenated sentences, the start page numbers, and the start indices.
"""
# Chunk information
chunk_word_count = 0
Expand Down
7 changes: 7 additions & 0 deletions haystack/components/preprocessors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ def split_sentences(self, text: str) -> List[Dict[str, Any]]:
return sentences

def _apply_split_rules(self, text: str, sentence_spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
"""
Applies additional split rules to the sentence spans.
:param text: The text to split.
:param sentence_spans: The list of sentence spans to split.
:returns: The list of sentence spans after applying the split rules.
"""
new_sentence_spans = []
quote_spans = [match.span() for match in re.finditer(r"\W(\"+|\'+).*?\1", text)]
while sentence_spans:
Expand Down

0 comments on commit 9af91c0

Please sign in to comment.