Skip to content

Commit

Permalink
Merge branch 'main' into fix-84
Browse files Browse the repository at this point in the history
  • Loading branch information
bhavnicksm authored Dec 27, 2024
2 parents 681fcc6 + 0a159a4 commit 5c3c290
Show file tree
Hide file tree
Showing 6 changed files with 431 additions and 127 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Chonkie provides several chunkers to help you split your text efficiently for RA
- **TokenChunker**: Splits text into fixed-size token chunks.
- **WordChunker**: Splits text into chunks based on words.
- **SentenceChunker**: Splits text into chunks based on sentences.
- **RecursiveChunker**: Splits text hierarchically using customizable rules to create semantically meaningful chunks.
- **SemanticChunker**: Splits text into chunks based on semantic similarity.
- **SDPMChunker**: Splits text using a Semantic Double-Pass Merge approach.
- **LateChunker (experimental)**: Embeds text and then splits it to have better chunk embeddings.
Expand Down Expand Up @@ -125,7 +126,7 @@ And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0z

If you use Chonkie in your research, please cite it as follows:

```
```bibtex
@misc{chonkie2024,
author = {Minhas, Bhavnick},
title = {Chonkie: A Fast Feature-full Chunking Library for RAG Bots},
Expand Down
12 changes: 10 additions & 2 deletions src/chonkie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

from .chunker import (
BaseChunker,
LateChunker,
RecursiveChunker,
SDPMChunker,
SemanticChunker,
SentenceChunker,
TokenChunker,
WordChunker,
LateChunker,
)
from .embeddings import (
AutoEmbeddings,
Expand All @@ -23,11 +24,14 @@
from .types import (
Chunk,
Context,
LateChunk,
RecursiveChunk,
RecursiveLevel,
RecursiveRules,
SemanticChunk,
SemanticSentence,
Sentence,
SentenceChunk,
LateChunk,
)

__version__ = "0.3.0"
Expand All @@ -45,6 +49,9 @@
__all__ += [
"Context",
"Chunk",
"RecursiveChunk",
"RecursiveLevel",
"RecursiveRules",
"SentenceChunk",
"SemanticChunk",
"Sentence",
Expand All @@ -61,6 +68,7 @@
"SemanticChunker",
"SDPMChunker",
"LateChunker",
"RecursiveChunker",
]

# Add all embeddings classes to __all__
Expand Down
4 changes: 3 additions & 1 deletion src/chonkie/chunker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
"""Module for chunkers."""

from .base import BaseChunker
from .late import LateChunker
from .recursive import RecursiveChunker
from .sdpm import SDPMChunker
from .semantic import SemanticChunker
from .sentence import SentenceChunker
from .token import TokenChunker
from .word import WordChunker
from .late import LateChunker

__all__ = [
"BaseChunker",
"RecursiveChunker",
"TokenChunker",
"WordChunker",
"SentenceChunker",
Expand Down
123 changes: 1 addition & 122 deletions src/chonkie/chunker/recursive.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,130 +6,9 @@
from typing import Any, List, Optional, Union

from chonkie.chunker.base import BaseChunker
from chonkie.types import Chunk
from chonkie.types import Chunk, RecursiveChunk, RecursiveRules, RecursiveLevel


@dataclass
class RecursiveLevel:
    """Settings describing one level of the recursive chunking hierarchy.

    Attributes:
        delimiters: Delimiter string(s) for this level. If None, that level
            will use tokens to determine chunk boundaries.
        whitespace: Whether whitespace acts as the delimiter for this level.

    """

    delimiters: Union[List[str], str, None] = None
    whitespace: bool = False

    def __post_init__(self):
        """Reject a level that asks for both delimiters and whitespace."""
        has_delimiters = self.delimiters is not None
        if has_delimiters and self.whitespace:
            message = ("Cannot have both delimiters and whitespace. "
                       "Use two separate levels instead, one for whitespace and one for delimiters.")
            raise ValueError(message)

    def __repr__(self) -> str:
        """Return the constructor-like description of this level."""
        description = f"RecursiveLevel(delimiters={self.delimiters}, whitespace={self.whitespace})"
        return description

    def __str__(self) -> str:
        """Return the same text as ``__repr__``, built independently."""
        description = f"RecursiveLevel(delimiters={self.delimiters}, whitespace={self.whitespace})"
        return description

@dataclass
class RecursiveRules:
    """Collection of rules for recursive chunking.

    Attributes:
        levels: The levels to apply, in order. May be a list of
            ``RecursiveLevel``, a single ``RecursiveLevel``, or None. When
            None, a default five-level hierarchy (paragraphs, sentences,
            sub-sentences, words, tokens) is installed in ``__post_init__``.

    """

    levels: Union[List[RecursiveLevel], RecursiveLevel, None] = None

    def __post_init__(self):
        """Initialize the recursive rules if not already initialized."""
        # Set default levels if not already initialized
        if self.levels is None:
            # First level should be paragraphs
            paragraph_level = RecursiveLevel(delimiters=["\n\n", "\n", "\r\n"],
                                             whitespace=False)
            # Second level should be sentences
            sentence_level = RecursiveLevel(delimiters=[".", "?", "!"],
                                            whitespace=False)

            # Third level can be sub-sentences, like '...', ',', ';', ':', etc.
            sub_sentence_level = RecursiveLevel(delimiters=[',',
                                                            ';',
                                                            ':',
                                                            '...',
                                                            '-',
                                                            '(',
                                                            ')',
                                                            '[',
                                                            ']',
                                                            '{',
                                                            '}',
                                                            '<',
                                                            '>',
                                                            '|',
                                                            '~',
                                                            '`',
                                                            '\'',
                                                            '\"'
                                                            ],
                                                whitespace=False)

            # Fourth level should be words
            word_level = RecursiveLevel(delimiters=None,
                                        whitespace=True)
            # Fifth level should be tokens
            # NOTE: When delimiters is None, the level will use tokens to determine chunk boundaries.
            token_level = RecursiveLevel(delimiters=None,
                                         whitespace=False)
            self.levels = [paragraph_level,
                           sentence_level,
                           sub_sentence_level,
                           word_level,
                           token_level]

    def _level_list(self):
        """Return the levels as a list, wrapping a single RecursiveLevel."""
        # The annotation allows a lone RecursiveLevel; without wrapping it,
        # iter()/len()/indexing below would raise TypeError.
        if isinstance(self.levels, RecursiveLevel):
            return [self.levels]
        return self.levels

    def __iter__(self):
        """Iterate over the levels."""
        return iter(self._level_list())

    def __getitem__(self, index: int) -> RecursiveLevel:
        """Get a level by index."""
        return self._level_list()[index]

    def __len__(self) -> int:
        """Get the number of levels."""
        return len(self._level_list())

    def __repr__(self) -> str:
        """Get a string representation of the recursive rules."""
        return f"RecursiveRules(levels={self.levels})"

    def __str__(self) -> str:
        """Get a string representation of the recursive rules."""
        return f"RecursiveRules(levels={self.levels})"


@dataclass
class RecursiveChunk(Chunk):
    """A Chunk that additionally carries a ``level`` attribute.

    Attributes:
        level: Integer level attached to the chunk, or None when unset.
            NOTE(review): presumably the index of the recursive level that
            produced the chunk; confirm against RecursiveChunker.

    """

    level: Union[int, None] = None

    def __repr__(self) -> str:
        """Return a one-line description listing the chunk's fields."""
        pieces = [
            f"RecursiveChunk(text={self.text}, ",
            f"start_index={self.start_index}, ",
            f"end_index={self.end_index}, ",
            f"token_count={self.token_count}, ",
            f"level={self.level})",
        ]
        return "".join(pieces)

    def __str__(self) -> str:
        """Return the same one-line description as ``__repr__``."""
        rendered = f"RecursiveChunk(text={self.text}, "
        rendered += f"start_index={self.start_index}, "
        rendered += f"end_index={self.end_index}, "
        rendered += f"token_count={self.token_count}, "
        rendered += f"level={self.level})"
        return rendered

class RecursiveChunker(BaseChunker):
"""Chunker that uses recursive rules to chunk text.
Expand Down
143 changes: 142 additions & 1 deletion src/chonkie/types.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Dataclasses for Chonkie."""

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional
from typing import TYPE_CHECKING, List, Optional, Union

if TYPE_CHECKING:
import numpy as np
Expand Down Expand Up @@ -235,3 +235,144 @@ class LateChunk(Chunk):

sentences: List[LateSentence] = field(default_factory=list)
embedding: Optional["np.ndarray"] = field(default=None)

@dataclass
class RecursiveLevel:
    """Configuration for a single level of recursive chunking.

    Attributes:
        delimiters: The delimiters to use for the level, either a list of
            strings or a single string (validated as one delimiter). If None,
            that level will use tokens to determine chunk boundaries.
        whitespace: Whether to use whitespace as a delimiter.

    """

    delimiters: Union[List[str], str, None] = None
    whitespace: bool = False

    def __post_init__(self):
        """Validate the level as soon as it is constructed."""
        self.validate()

    def validate(self):
        """Validate the recursive level.

        Raises:
            ValueError: If both ``delimiters`` and ``whitespace`` are set, or
                if any delimiter is not a string, is empty, or is a single
                space.

        """
        if self.delimiters is not None and self.whitespace:
            raise ValueError("Cannot have both delimiters and whitespace. "
                             "Use two separate levels instead, one for whitespace and one for delimiters.")
        if self.delimiters is None:
            return

        # A bare string is ONE delimiter. Iterating it directly would check
        # each character instead (rejecting e.g. ". " and accepting "").
        if isinstance(self.delimiters, str):
            candidates = [self.delimiters]
        else:
            candidates = self.delimiters

        for delimiter in candidates:
            if not isinstance(delimiter, str):
                raise ValueError("All delimiters must be strings")
            if len(delimiter) == 0:
                raise ValueError("All delimiters must be non-empty strings")
            if delimiter == " ":
                # Single concatenated message: a comma here would hand
                # ValueError two arguments and render as a tuple.
                raise ValueError("Cannot use whitespace as a delimiter. "
                                 "Use whitespace=True instead")

    def __repr__(self) -> str:
        """Get a string representation of the recursive level."""
        return f"RecursiveLevel(delimiters={self.delimiters}, whitespace={self.whitespace})"

    def __str__(self) -> str:
        """Get a string representation of the recursive level."""
        return f"RecursiveLevel(delimiters={self.delimiters}, whitespace={self.whitespace})"

@dataclass
class RecursiveRules:
    """Collection of rules for recursive chunking.

    Attributes:
        levels: The levels to apply, in order. May be a list of
            ``RecursiveLevel``, a single ``RecursiveLevel``, or None. When
            None, a default five-level hierarchy (paragraphs, sentences,
            sub-sentences, words, tokens) is installed in ``__post_init__``.

    """

    levels: Union[List[RecursiveLevel], RecursiveLevel, None] = None

    def __post_init__(self):
        """Install default levels, or re-validate the levels supplied.

        Raises:
            ValueError: Propagated from ``RecursiveLevel.validate`` when a
                supplied level is invalid.

        """
        # Set default levels if not already initialized
        if self.levels is None:
            # First level should be paragraphs
            paragraph_level = RecursiveLevel(delimiters=["\n\n", "\n", "\r\n"],
                                             whitespace=False)
            # Second level should be sentences
            sentence_level = RecursiveLevel(delimiters=[".", "?", "!"],
                                            whitespace=False)

            # Third level can be sub-sentences, like '...', ',', ';', ':', etc.
            sub_sentence_level = RecursiveLevel(delimiters=[',',
                                                            ';',
                                                            ':',
                                                            '...',
                                                            '-',
                                                            '(',
                                                            ')',
                                                            '[',
                                                            ']',
                                                            '{',
                                                            '}',
                                                            '<',
                                                            '>',
                                                            '|',
                                                            '~',
                                                            '`',
                                                            '\'',
                                                            '\"'
                                                            ],
                                                whitespace=False)

            # Fourth level should be words
            word_level = RecursiveLevel(delimiters=None,
                                        whitespace=True)
            # Fifth level should be tokens
            # NOTE: When delimiters is None, the level will use tokens to determine chunk boundaries.
            token_level = RecursiveLevel(delimiters=None,
                                         whitespace=False)
            self.levels = [paragraph_level,
                           sentence_level,
                           sub_sentence_level,
                           word_level,
                           token_level]
        elif isinstance(self.levels, RecursiveLevel):
            # Levels are mutable, so re-check even though construction validated.
            self.levels.validate()
        elif isinstance(self.levels, list) and all(isinstance(level, RecursiveLevel) for level in self.levels):
            for level in self.levels:
                level.validate()

    def _level_list(self):
        """Return the levels as a list, wrapping a single RecursiveLevel."""
        # The annotation allows a lone RecursiveLevel; without wrapping it,
        # iter()/len()/indexing below would raise TypeError.
        if isinstance(self.levels, RecursiveLevel):
            return [self.levels]
        return self.levels

    def __iter__(self):
        """Iterate over the levels."""
        return iter(self._level_list())

    def __getitem__(self, index: int) -> RecursiveLevel:
        """Get a level by index."""
        return self._level_list()[index]

    def __len__(self) -> int:
        """Get the number of levels."""
        return len(self._level_list())

    def __repr__(self) -> str:
        """Get a string representation of the recursive rules."""
        return f"RecursiveRules(levels={self.levels})"

    def __str__(self) -> str:
        """Get a string representation of the recursive rules."""
        return f"RecursiveRules(levels={self.levels})"


@dataclass
class RecursiveChunk(Chunk):
    """A Chunk that additionally carries a ``level`` attribute.

    Attributes:
        level: Integer level attached to the chunk, or None when unset.
            NOTE(review): presumably the index of the recursive level that
            produced the chunk; confirm against RecursiveChunker.

    """

    level: Union[int, None] = None

    def __repr__(self) -> str:
        """Return a one-line description listing the chunk's fields."""
        pieces = [
            f"RecursiveChunk(text={self.text}, ",
            f"start_index={self.start_index}, ",
            f"end_index={self.end_index}, ",
            f"token_count={self.token_count}, ",
            f"level={self.level})",
        ]
        return "".join(pieces)

    def __str__(self) -> str:
        """Return the same one-line description as ``__repr__``."""
        rendered = f"RecursiveChunk(text={self.text}, "
        rendered += f"start_index={self.start_index}, "
        rendered += f"end_index={self.end_index}, "
        rendered += f"token_count={self.token_count}, "
        rendered += f"level={self.level})"
        return rendered
Loading

0 comments on commit 5c3c290

Please sign in to comment.