Support direct text outputs for SemanticChunkers
bhavnicksm committed Jan 7, 2025
1 parent d0cc22f commit 9da85f1
Showing 2 changed files with 39 additions and 23 deletions.
21 changes: 13 additions & 8 deletions src/chonkie/chunker/sdpm.py
@@ -1,6 +1,6 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union
from typing import Any, List, Union, Literal

from chonkie.types import SemanticChunk, Sentence

@@ -17,15 +17,17 @@ class SDPMChunker(SemanticChunker):
Args:
embedding_model: Sentence embedding model to use
similarity_threshold: Minimum similarity score to consider sentences similar
similarity_percentile: Minimum similarity percentile to consider sentences similar
mode: Mode for grouping sentences, either "cumulative" or "window"
threshold: Threshold for semantic similarity (0-1) or percentile (1-100), defaults to "auto"
chunk_size: Maximum token count for a chunk
initial_sentences: Number of sentences to consider for initial grouping
skip_window: Number of chunks to skip when looking for similarities
similarity_window: Number of sentences to consider for similarity threshold calculation
min_sentences: Minimum number of sentences per chunk
min_chunk_size: Minimum number of tokens per chunk
Methods:
chunk: Split text into chunks using the SDPM approach.
min_characters_per_sentence: Minimum number of characters per sentence
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
return_type: Whether to return chunks or texts
"""

@@ -42,6 +44,7 @@ def __init__(
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
skip_window: int = 1,
return_type: Literal["chunks", "texts"] = "chunks",
**kwargs
):
"""Initialize the SDPMChunker.
@@ -58,6 +61,7 @@ def __init__(
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
return_type: Whether to return chunks or texts
**kwargs: Additional keyword arguments
"""
@@ -72,6 +76,7 @@ def __init__(
min_characters_per_sentence=min_characters_per_sentence,
threshold_step=threshold_step,
delim=delim,
return_type=return_type,
**kwargs
)
self.skip_window = skip_window
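
The new return_type parameter is simply forwarded to the parent SemanticChunker, so SDPMChunker can now hand back plain strings instead of SemanticChunk objects. A minimal usage sketch, assuming the library's top-level SDPMChunker export and its default embedding model (the model identifier and sample text are illustrative, not part of this commit):

from chonkie import SDPMChunker

# Sketch only: the model identifier and argument values are illustrative.
chunker = SDPMChunker(
    embedding_model="minishlab/potion-base-8M",  # any supported sentence-embedding model
    threshold=0.5,        # fixed similarity threshold in (0, 1)
    chunk_size=512,       # maximum tokens per chunk
    skip_window=1,        # chunks to skip when looking for similar groups to merge
    return_type="texts",  # new: return plain strings instead of SemanticChunk objects
)

texts = chunker.chunk(
    "First topic sentence. Another thought on the same topic. A new topic starts here."
)
for t in texts:
    print(repr(t))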
41 changes: 26 additions & 15 deletions src/chonkie/chunker/semantic.py
@@ -1,7 +1,7 @@
"""Semantic chunking using sentence embeddings."""

import warnings
from typing import List, Union
from typing import List, Union, Literal

import numpy as np

@@ -24,7 +24,10 @@ class SemanticChunker(BaseChunker):
min_chunk_size: Minimum number of tokens per chunk (defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
Raises:
ValueError: If parameters are invalid
"""

def __init__(
@@ -39,6 +42,7 @@ def __init__(
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
return_type: Literal["chunks", "texts"] = "chunks",
**kwargs
):
"""Initialize the SemanticChunker.
@@ -56,6 +60,7 @@ def __init__(
min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
**kwargs: Additional keyword arguments
Raises:
@@ -85,6 +90,8 @@ def __init__(
raise ValueError("threshold (float) must be between 0 and 1")
elif type(threshold) == int and (threshold < 1 or threshold > 100):
raise ValueError("threshold (int) must be between 1 and 100")
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

self.mode = mode
self.chunk_size = chunk_size
@@ -96,6 +103,7 @@ def __init__(
self.threshold_step = threshold_step
self.delim = delim
self.sep = "🦛"
self.return_type = return_type

if isinstance(threshold, float):
self.similarity_threshold = threshold
@@ -453,24 +461,27 @@ def _group_sentences(self, sentences: List[Sentence]) -> List[List[Sentence]]:
return self._group_sentences_window(sentences)

def _create_chunk(
self, sentences: List[Sentence], similarity_scores: List[float] = None
self, sentences: List[Sentence]
) -> SemanticChunk:
"""Create a chunk from a list of sentences."""
if not sentences:
raise ValueError("Cannot create chunk from empty sentence list")

# Compute chunk text and token count from sentences
text = "".join(sent.text for sent in sentences)
token_count = sum(sent.token_count for sent in sentences)

return SemanticChunk(
text=text,
start_index=sentences[0].start_index,
end_index=sentences[-1].end_index,
token_count=token_count,
sentences=sentences,
)

if self.return_type == "chunks":
# Compute chunk text and token count from sentences
text = "".join(sent.text for sent in sentences)
token_count = sum(sent.token_count for sent in sentences)
return SemanticChunk(
text=text,
start_index=sentences[0].start_index,
end_index=sentences[-1].end_index,
token_count=token_count,
sentences=sentences,
)
elif self.return_type == "texts":
return "".join(sent.text for sent in sentences)
else:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

def _split_chunks(
self, sentence_groups: List[List[Sentence]]
) -> List[SemanticChunk]:
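
Taken together, the semantic.py changes validate return_type once in __init__ and then branch in _create_chunk, so the list returned by chunk() contains either SemanticChunk objects or bare strings. A sketch of how the two modes would behave, again assuming the top-level export and default embedding model (names and sample text are illustrative):

from chonkie import SemanticChunker

text = "Chunking splits documents into pieces. Embeddings group related sentences."

# Default mode: rich SemanticChunk objects with offsets, token counts, and sentences.
chunker = SemanticChunker(embedding_model="minishlab/potion-base-8M", threshold=0.5)
chunks = chunker.chunk(text)
print(chunks[0].start_index, chunks[0].end_index, chunks[0].token_count)

# New mode: plain strings, convenient when only the text is needed downstream.
text_chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-8M",
    threshold=0.5,
    return_type="texts",
)
print(text_chunker.chunk(text))  # expected: a list of str

# Anything other than "chunks" or "texts" is rejected at construction time.
try:
    SemanticChunker(return_type="dicts")
except ValueError as e:
    print(e)  # Invalid return_type. Must be either 'chunks' or 'texts'.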
