From 6910b92c16eb6fed69c8ad8b4c3033f0441fc088 Mon Sep 17 00:00:00 2001 From: udayk02 Date: Thu, 2 Jan 2025 18:08:18 +0530 Subject: [PATCH 1/4] bugfix #116 --- src/chonkie/chunker/token.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py index 1912163..93cf09a 100644 --- a/src/chonkie/chunker/token.py +++ b/src/chonkie/chunker/token.py @@ -52,28 +52,27 @@ def __init__( def _create_chunks( self, - chunk_texts: List[str], token_counts: List[int], - decoded_text: str, + token_groups: List[List[int]] ) -> List[Chunk]: """Create chunks from a list of texts.""" # package everything as Chunk objects and send out the result + chunk_texts = self._decode_batch(token_groups) chunks = [] current_index = 0 - for chunk_text, token_count in zip(chunk_texts, token_counts): - start_index = decoded_text.find( - chunk_text, current_index - ) # Find needs to be run every single time because of unknown overlap length - end_index = start_index + len(chunk_text) + for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups): + end_index = current_index + len(chunk_text) chunks.append( Chunk( text=chunk_text, - start_index=start_index, + start_index=current_index, end_index=end_index, token_count=token_count, ) ) - current_index = end_index + # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk + overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group)) + current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]]))) return chunks def chunk(self, text: str) -> List[Chunk]: @@ -92,9 +91,6 @@ def chunk(self, text: str) -> List[Chunk]: # Encode full text text_tokens = self._encode(text) - # We decode the text because the tokenizer might result in a different output than text - decoded_text = self._decode(text_tokens) - # Calculate chunk positions token_groups = [ 
text_tokens[ @@ -108,11 +104,7 @@ def chunk(self, text: str) -> List[Chunk]: len(toks) for toks in token_groups ] # get the token counts; it's prolly chunk_size, but len doesn't take too long - chunk_texts = self._decode_batch( - token_groups - ) # decrease the time by decoding in one go (?) - - chunks = self._create_chunks(chunk_texts, token_counts, decoded_text) + chunks = self._create_chunks(token_counts, token_groups) return chunks From 83940b9076103bb470f8d393ccaa85352346c468 Mon Sep 17 00:00:00 2001 From: udayk02 Date: Fri, 3 Jan 2025 12:26:52 +0530 Subject: [PATCH 2/4] update: bugfix #116 - removed the unnecessary `join` as there is only one token_group. - replaced `_decode_batch` with `_decode` --- src/chonkie/chunker/token.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py index 93cf09a..5e7f4b1 100644 --- a/src/chonkie/chunker/token.py +++ b/src/chonkie/chunker/token.py @@ -72,7 +72,7 @@ def _create_chunks( ) # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group)) - current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]]))) + current_index = end_index - len(self._decode(token_group[-overlap_tokens:])) return chunks def chunk(self, text: str) -> List[Chunk]: From 53d532de3c3abd1d7cb681e409c4d11fe7e8d5d8 Mon Sep 17 00:00:00 2001 From: udayk02 Date: Fri, 3 Jan 2025 21:49:29 +0530 Subject: [PATCH 3/4] update: bugfix #116 - fixed `start_index` staying at 0 for every chunk when `chunk_overlap` is 0: `token_group[-0:]` selects the whole group, so the full chunk length was subtracted from `end_index`. 
--- src/chonkie/chunker/token.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py index 5e7f4b1..8cf8e2b 100644 --- a/src/chonkie/chunker/token.py +++ b/src/chonkie/chunker/token.py @@ -70,9 +70,11 @@ def _create_chunks( token_count=token_count, ) ) + # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group)) - current_index = end_index - len(self._decode(token_group[-overlap_tokens:])) + current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else [])) + return chunks def chunk(self, text: str) -> List[Chunk]: From e069fb7ad9c3d4c56d959f4cbd121d14036585b1 Mon Sep 17 00:00:00 2001 From: udayk02 Date: Sat, 4 Jan 2025 10:33:56 +0530 Subject: [PATCH 4/4] update: bugfix #116 - the overlap adjustment is computed and applied only when `chunk_overlap` > 0 - overlap texts are decoded in a single `_decode_batch` call instead of one `_decode` call per chunk --- src/chonkie/chunker/token.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py index 8cf8e2b..94e1d73 100644 --- a/src/chonkie/chunker/token.py +++ b/src/chonkie/chunker/token.py @@ -60,7 +60,16 @@ def _create_chunks( chunk_texts = self._decode_batch(token_groups) chunks = [] current_index = 0 - for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups): + + if (self.chunk_overlap > 0): + overlap_tokens_space = [ + # we get the space taken by the overlapping text, that gives you the start_index for the next chunk + len(overlap_text) + for overlap_text in self._decode_batch([token_group[-(self.chunk_overlap - (self.chunk_size - len(token_group))):] + for token_group in token_groups]) + ] + + for i, (chunk_text, token_count) in enumerate(zip(chunk_texts, token_counts)): end_index = current_index + len(chunk_text) chunks.append( Chunk( @@ -71,9 +80,7 @@ def 
_create_chunks( ) ) - # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk - overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group)) - current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else [])) + current_index = end_index - (overlap_tokens_space[i] if self.chunk_overlap > 0 else 0) return chunks