Support direct text outputs for SemanticChunkers
bhavnicksm committed Jan 7, 2025
1 parent d0cc22f commit 9da85f1
Showing 2 changed files with 39 additions and 23 deletions.
21 changes: 13 additions & 8 deletions src/chonkie/chunker/sdpm.py
@@ -1,6 +1,6 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union
from typing import Any, List, Union, Literal

from chonkie.types import SemanticChunk, Sentence

@@ -17,15 +17,17 @@ class SDPMChunker(SemanticChunker):
Args:
embedding_model: Sentence embedding model to use
similarity_threshold: Minimum similarity score to consider sentences similar
similarity_percentile: Minimum similarity percentile to consider sentences similar
mode: Mode for grouping sentences, either "cumulative" or "window"
threshold: Threshold for semantic similarity (0-1) or percentile (1-100), defaults to "auto"
chunk_size: Maximum token count for a chunk
initial_sentences: Number of sentences to consider for initial grouping
skip_window: Number of chunks to skip when looking for similarities
similarity_window: Number of sentences to consider for similarity threshold calculation
min_sentences: Minimum number of sentences per chunk
min_chunk_size: Minimum number of tokens per chunk
Methods:
chunk: Split text into chunks using the SDPM approach.
min_characters_per_sentence: Minimum number of characters per sentence
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
return_type: Whether to return chunks or texts
"""

@@ -42,6 +44,7 @@ def __init__(
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
skip_window: int = 1,
return_type: Literal["chunks", "texts"] = "chunks",
**kwargs
):
"""Initialize the SDPMChunker.
@@ -58,6 +61,7 @@ def __init__(
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
return_type: Whether to return chunks or texts
**kwargs: Additional keyword arguments
"""
@@ -72,6 +76,7 @@ def __init__(
min_characters_per_sentence=min_characters_per_sentence,
threshold_step=threshold_step,
delim=delim,
return_type=return_type,
**kwargs
)
self.skip_window = skip_window
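
The new return_type parameter is simply forwarded to the parent SemanticChunker, so SDPMChunker can now hand back plain strings instead of SemanticChunk objects. A minimal usage sketch, assuming the library's top-level SDPMChunker export and its default embedding model (the model identifier and sample text are illustrative, not part of this commit):

from chonkie import SDPMChunker

# Sketch only: the model identifier and argument values are illustrative.
chunker = SDPMChunker(
    embedding_model="minishlab/potion-base-8M",  # any supported sentence-embedding model
    threshold=0.5,        # fixed similarity threshold in (0, 1)
    chunk_size=512,       # maximum tokens per chunk
    skip_window=1,        # chunks to skip when looking for similar groups to merge
    return_type="texts",  # new: return plain strings instead of SemanticChunk objects
)

texts = chunker.chunk(
    "First topic sentence. Another thought on the same topic. A new topic starts here."
)
for t in texts:
    print(repr(t))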
41 changes: 26 additions & 15 deletions src/chonkie/chunker/semantic.py
@@ -1,7 +1,7 @@
"""Semantic chunking using sentence embeddings."""

import warnings
from typing import List, Union
from typing import List, Union, Literal

import numpy as np

@@ -24,7 +24,10 @@ class SemanticChunker(BaseChunker):
min_chunk_size: Minimum number of tokens per chunk (defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
Raises:
ValueError: If parameters are invalid
"""

def __init__(
@@ -39,6 +42,7 @@ def __init__(
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
return_type: Literal["chunks", "texts"] = "chunks",
**kwargs
):
"""Initialize the SemanticChunker.
@@ -56,6 +60,7 @@ def __init__(
min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
**kwargs: Additional keyword arguments
Raises:
@@ -85,6 +90,8 @@ def __init__(
raise ValueError("threshold (float) must be between 0 and 1")
elif type(threshold) == int and (threshold < 1 or threshold > 100):
raise ValueError("threshold (int) must be between 1 and 100")
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

self.mode = mode
self.chunk_size = chunk_size
@@ -96,6 +103,7 @@ def __init__(
self.threshold_step = threshold_step
self.delim = delim
self.sep = "🦛"
self.return_type = return_type

if isinstance(threshold, float):
self.similarity_threshold = threshold
@@ -453,24 +461,27 @@ def _group_sentences(self, sentences: List[Sentence]) -> List[List[Sentence]]:
return self._group_sentences_window(sentences)

def _create_chunk(
self, sentences: List[Sentence], similarity_scores: List[float] = None
self, sentences: List[Sentence]
) -> SemanticChunk:
"""Create a chunk from a list of sentences."""
if not sentences:
raise ValueError("Cannot create chunk from empty sentence list")

# Compute chunk text and token count from sentences
text = "".join(sent.text for sent in sentences)
token_count = sum(sent.token_count for sent in sentences)

return SemanticChunk(
text=text,
start_index=sentences[0].start_index,
end_index=sentences[-1].end_index,
token_count=token_count,
sentences=sentences,
)

if self.return_type == "chunks":
# Compute chunk text and token count from sentences
text = "".join(sent.text for sent in sentences)
token_count = sum(sent.token_count for sent in sentences)
return SemanticChunk(
text=text,
start_index=sentences[0].start_index,
end_index=sentences[-1].end_index,
token_count=token_count,
sentences=sentences,
)
elif self.return_type == "texts":
return "".join(sent.text for sent in sentences)
else:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

def _split_chunks(
self, sentence_groups: List[List[Sentence]]
) -> List[SemanticChunk]:
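
Taken together, the semantic.py changes validate return_type once in __init__ and then branch in _create_chunk, so the list returned by chunk() contains either SemanticChunk objects or bare strings. A sketch of how the two modes would behave, again assuming the top-level export and default embedding model (names and sample text are illustrative):

from chonkie import SemanticChunker

text = "Chunking splits documents into pieces. Embeddings group related sentences."

# Default mode: rich SemanticChunk objects with offsets, token counts, and sentences.
chunker = SemanticChunker(embedding_model="minishlab/potion-base-8M", threshold=0.5)
chunks = chunker.chunk(text)
print(chunks[0].start_index, chunks[0].end_index, chunks[0].token_count)

# New mode: plain strings, convenient when only the text is needed downstream.
text_chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-8M",
    threshold=0.5,
    return_type="texts",
)
print(text_chunker.chunk(text))  # expected: a list of str

# Anything other than "chunks" or "texts" is rejected at construction time.
try:
    SemanticChunker(return_type="dicts")
except ValueError as e:
    print(e)  # Invalid return_type. Must be either 'chunks' or 'texts'.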
