From 6910b92c16eb6fed69c8ad8b4c3033f0441fc088 Mon Sep 17 00:00:00 2001
From: udayk02 <udaykirankarusodi@gmail.com>
Date: Thu, 2 Jan 2025 18:08:18 +0530
Subject: [PATCH 1/4] bugfix #116

---
 src/chonkie/chunker/token.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 1912163..93cf09a 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -52,28 +52,27 @@ def __init__(
     
     def _create_chunks(
         self,
-        chunk_texts: List[str],
         token_counts: List[int],
-        decoded_text: str,
+        token_groups: List[List[int]]
     ) -> List[Chunk]:
         """Create chunks from a list of texts."""
         # package everything as Chunk objects and send out the result
+        chunk_texts = self._decode_batch(token_groups)
         chunks = []
         current_index = 0
-        for chunk_text, token_count in zip(chunk_texts, token_counts):
-            start_index = decoded_text.find(
-                chunk_text, current_index
-            )  # Find needs to be run every single time because of unknown overlap length
-            end_index = start_index + len(chunk_text)
+        for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups):
+            end_index = current_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
-                    start_index=start_index,
+                    start_index=current_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-            current_index = end_index
+            # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
+            overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
+            current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]])))
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:
@@ -92,9 +91,6 @@ def chunk(self, text: str) -> List[Chunk]:
         # Encode full text
         text_tokens = self._encode(text)
 
-        # We decode the text because the tokenizer might result in a different output than text
-        decoded_text = self._decode(text_tokens)
-
         # Calculate chunk positions
         token_groups = [
             text_tokens[
@@ -108,11 +104,7 @@ def chunk(self, text: str) -> List[Chunk]:
             len(toks) for toks in token_groups
         ]  # get the token counts; it's prolly chunk_size, but len doesn't take too long
 
-        chunk_texts = self._decode_batch(
-            token_groups
-        )  # decrease the time by decoding in one go (?)
-
-        chunks = self._create_chunks(chunk_texts, token_counts, decoded_text)
+        chunks = self._create_chunks(token_counts, token_groups)
 
         return chunks
 

From 83940b9076103bb470f8d393ccaa85352346c468 Mon Sep 17 00:00:00 2001
From: udayk02 <udaykirankarusodi@gmail.com>
Date: Fri, 3 Jan 2025 12:26:52 +0530
Subject: [PATCH 2/4] update: bugfix #116

- Removed the unnecessary `join`, since only a single token group is decoded.
- Replaced `_decode_batch` with `_decode` for that single token group.
---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 93cf09a..5e7f4b1 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -72,7 +72,7 @@ def _create_chunks(
             )
             # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
             overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]])))
+            current_index = end_index - len(self._decode(token_group[-overlap_tokens:]))
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:

From 53d532de3c3abd1d7cb681e409c4d11fe7e8d5d8 Mon Sep 17 00:00:00 2001
From: udayk02 <udaykirankarusodi@gmail.com>
Date: Fri, 3 Jan 2025 21:49:29 +0530
Subject: [PATCH 3/4] update: bugfix #116 - fix `start_index` staying at 0
 when `chunk_overlap` is 0

---
 src/chonkie/chunker/token.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 5e7f4b1..8cf8e2b 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -70,9 +70,11 @@ def _create_chunks(
                     token_count=token_count,
                 )
             )
+            
             # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
             overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len(self._decode(token_group[-overlap_tokens:]))
+            current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else []))
+        
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:

From e069fb7ad9c3d4c56d959f4cbd121d14036585b1 Mon Sep 17 00:00:00 2001
From: udayk02 <udaykirankarusodi@gmail.com>
Date: Sat, 4 Jan 2025 10:33:56 +0530
Subject: [PATCH 4/4] update: bugfix #116 - subtract the overlap only when
 chunk_overlap > 0; batch-decode the overlap texts

---
 src/chonkie/chunker/token.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 8cf8e2b..94e1d73 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -60,7 +60,16 @@ def _create_chunks(
         chunk_texts = self._decode_batch(token_groups)
         chunks = []
         current_index = 0
-        for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups):
+
+        if (self.chunk_overlap > 0):
+            overlap_tokens_space = [
+                # we get the space taken by the overlapping text, that gives you the start_index for the next chunk
+                len(overlap_text)
+                for overlap_text in self._decode_batch([token_group[-(self.chunk_overlap - (self.chunk_size - len(token_group))):]
+                                                        for token_group in token_groups])
+            ]
+
+        for i, (chunk_text, token_count) in enumerate(zip(chunk_texts, token_counts)):
             end_index = current_index + len(chunk_text)
             chunks.append(
                 Chunk(
@@ -71,9 +80,7 @@ def _create_chunks(
                 )
             )
             
-            # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
-            overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else []))
+            current_index = end_index - (overlap_tokens_space[i] if self.chunk_overlap > 0 else 0)
         
         return chunks